Beispiel #1
0
/*!
    Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine
    to the region of this buffer defined by \a rect and \a bufferBytesPerLine.
    Returns true if the write was successful; false otherwise.

    This function will queue the request and return immediately.
    Returns an event object that can be used to wait for the
    request to finish.

    The request will not start until all of the events in \a after
    have been signaled as finished.  The request is executed on
    the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa writeRect(), readRectAsync()
*/
QCLEvent QCLBuffer::writeRectAsync
(const QRect &rect, const void *data,
 size_t bufferBytesPerLine, size_t hostBytesPerLine,
 const QCLEventList &after)
{
#ifdef QT_OPENCL_1_1
    size_t bufferOrigin[3] = {rect.x(), rect.y(), 0};
    size_t bufferRegion[3] = {rect.width(), rect.height(), 1};
    static size_t const hostOrigin[3] = {0, 0, 0};
    cl_event event;
    cl_int error = clEnqueueWriteBufferRect
                   (context()->activeQueue(), memoryId(),
                    CL_FALSE, bufferOrigin, hostOrigin, bufferRegion,
                    bufferBytesPerLine, 0, hostBytesPerLine, 0, data,
                    after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::writeRectAsync:", error);
    if (error != CL_SUCCESS)
        return QCLEvent();
    else
        return QCLEvent(event);
#else
    context()->reportError("QCLBuffer::writeRectAsync:", CL_INVALID_OPERATION);
    Q_UNUSED(rect);
    Q_UNUSED(data);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(hostBytesPerLine);
    Q_UNUSED(after);
    return QCLEvent();
#endif
}
Beispiel #2
0
/*!
    Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine,
    and a slice pitch of \a hostBytesPerSlice, to the 3D region defined
    by \a origin, \a size, \a bufferBytesPerLine, and \a bufferBytesPerSlice
    in this buffer.

    This function will queue the request and return immediately.
    Returns an event object that can be used to wait for the
    request to finish.

    The request will not start until all of the events in \a after
    have been signaled as finished.  The request is executed on
    the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa writeRect(), readRectAsync()
*/
QCLEvent QCLBuffer::writeRectAsync
(const size_t origin[3], const size_t size[3], const void *data,
 size_t bufferBytesPerLine, size_t bufferBytesPerSlice,
 size_t hostBytesPerLine, size_t hostBytesPerSlice,
 const QCLEventList &after)
{
#ifdef QT_OPENCL_1_1
    static size_t const hostOrigin[3] = {0, 0, 0};
    cl_event event;
    cl_int error = clEnqueueWriteBufferRect
                   (context()->activeQueue(), memoryId(),
                    CL_FALSE, origin, hostOrigin, size,
                    bufferBytesPerLine, bufferBytesPerSlice,
                    hostBytesPerLine, hostBytesPerSlice, data,
                    after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::writeRectAsync(3D):", error);
    if (error != CL_SUCCESS)
        return QCLEvent();
    else
        return QCLEvent(event);
#else
    context()->reportError("QCLBuffer::writeRectAsync(3D):", CL_INVALID_OPERATION);
    Q_UNUSED(origin);
    Q_UNUSED(size);
    Q_UNUSED(data);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(bufferBytesPerSlice);
    Q_UNUSED(hostBytesPerLine);
    Q_UNUSED(hostBytesPerSlice);
    Q_UNUSED(after);
    return QCLEvent();
#endif
}
Beispiel #3
0
/*!
    Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine,
    and a slice pitch of \a hostBytesPerSlice, to the 3D region defined
    by \a origin, \a size, \a bufferBytesPerLine, and \a bufferBytesPerSlice
    in this buffer.  Returns true if the write was successful; false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa writeRectAsync(), readRect()
*/
bool QCLBuffer::writeRect
(const size_t origin[3], const size_t size[3], const void *data,
 size_t bufferBytesPerLine, size_t bufferBytesPerSlice,
 size_t hostBytesPerLine, size_t hostBytesPerSlice)
{
#ifdef QT_OPENCL_1_1
    static size_t const hostOrigin[3] = {0, 0, 0};
    cl_int error = clEnqueueWriteBufferRect
                   (context()->activeQueue(), memoryId(),
                    CL_TRUE, origin, hostOrigin, size,
                    bufferBytesPerLine, bufferBytesPerSlice,
                    hostBytesPerLine, hostBytesPerSlice, data, 0, 0, 0);
    context()->reportError("QCLBuffer::writeRect(3D):", error);
    return error == CL_SUCCESS;
#else
    context()->reportError("QCLBuffer::writeRect(3D):", CL_INVALID_OPERATION);
    Q_UNUSED(origin);
    Q_UNUSED(size);
    Q_UNUSED(data);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(bufferBytesPerSlice);
    Q_UNUSED(hostBytesPerLine);
    Q_UNUSED(hostBytesPerSlice);
    return false;
#endif
}
Beispiel #4
0
      /* H2D */
      void copyPitched(cl_mem dst,
                       const void* src,
                       cl_command_queue queue,
                       size_t w,
                       size_t h,
                       size_t pitch) const
      {
        const size_t offset[3] = {0, 0, 0};
        size_t region[3] = {w, h, 1};
        CHECK_CL(clEnqueueWriteBufferRect( queue,
                                           dst,
                                           Blocking<T_Async>::value,
                                           offset, // buffer origin
                                           offset, // host origin
                                           region,
                                           pitch, // buffer row pitch
                                           0, // buffer slice pitch
                                           0, // host row pitch
                                           0, // host slice pitch
                                           src,
                                           0, // num_events_in_wait_list
                                           nullptr, // event_wait_list
                                           nullptr )); // event

      }
Beispiel #5
0
void DeviceMatrixCL3D_copyToDevice(DeviceMatrixCL3D& self, const float* data)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
		const int mem_size = self.dim_y *self.dim_t * self.pitch_y;
		TheContext * tc = new TheContext();
		
		size_t buffer_origin[3] = {0,0,0};
		size_t host_origin[3] = {0,0,0};	
		size_t region[3] = {
			self.dim_x * sizeof(float),
			self.dim_y,
			self.dim_t};
		
		int err = clEnqueueWriteBufferRect(
			tc->getMyContext()->cqCommandQueue,
			self.dataMatrix, CL_TRUE,
			buffer_origin, host_origin, region,
			self.pitch_y, 0,
			sizeof(float) * self.dim_x, 0,
			data,
			0, NULL, NULL);
		
		if (err != 0){
			std::cout << "Error in copyToDevice (CODE: " << err << ")" << std::endl;
		}
    }
}
    magma_err_t
magma_zsetmatrix_trace(
        magma_int_t m, magma_int_t n,
        magmaDoubleComplex const* hA_src, size_t hA_offset, magma_int_t ldha,
        magmaDoubleComplex_ptr    dA_dst, size_t dA_offset, magma_int_t ldda,
        magma_queue_t queue, magma_event_t *event )
{
    size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t host_orig[3]     = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueWriteBufferRect(
            queue, dA_dst, CL_TRUE,  // blocking
            buffer_origin, host_orig, region,
            ldda*sizeof(magmaDoubleComplex), 0,
            ldha*sizeof(magmaDoubleComplex), 0,
            hA_src, 0, NULL, event );
    return err;
}
Beispiel #7
0
//// ========================================
//// copying sub-matrices (contiguous columns)
//// OpenCL takes queue even for blocking transfers, oddly.
magma_err_t
magma_ssetmatrix(
    magma_int_t m, magma_int_t n,
    float const* hA_src, size_t hA_offset, magma_int_t ldha,
    magmaFloat_ptr    dA_dst, size_t dA_offset, magma_int_t ldda,
    magma_queue_t queue )
{
    size_t buffer_origin[3] = { dA_offset*sizeof(float), 0, 0 };
    size_t host_orig[3]     = { 0, 0, 0 };
    size_t region[3]        = { m*sizeof(float), n, 1 };
    cl_int err = clEnqueueWriteBufferRect(
        queue, dA_dst, CL_TRUE,  // blocking
        buffer_origin, host_orig, region,
        ldda*sizeof(float), 0,
        ldha*sizeof(float), 0,
        hA_src, 0, NULL, NULL );
    return err;
}
Beispiel #8
0
// ========================================
// copying sub-matrices (contiguous columns)
// OpenCL takes queue even for blocking transfers, oddly.
extern "C" void
magma_zsetmatrix(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex const* hA_src,                   magma_int_t ldha,
    magmaDoubleComplex_ptr    dB_dst, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue )
{
    if (m <= 0 || n <= 0)
        return;

    size_t buffer_origin[3] = { dB_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t host_orig[3]     = { 0, 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueWriteBufferRect(
        queue, dB_dst, CL_TRUE,  // blocking
        buffer_origin, host_orig, region,
        lddb*sizeof(magmaDoubleComplex), 0,
        ldha*sizeof(magmaDoubleComplex), 0,
        hA_src, 0, NULL, g_event );
    check_error( err );
}
Beispiel #9
0
// --------------------
magma_err_t
magma_zsetmatrix_async(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex const* hA_src, size_t hA_offset, magma_int_t ldha,
    magmaDoubleComplex_ptr    dA_dst, size_t dA_offset, magma_int_t ldda,
    magma_queue_t queue, magma_event_t *event )
{
    if ( m<=0 || n <= 0 )
       return MAGMA_SUCCESS;

    size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t host_orig[3]     = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueWriteBufferRect(
        queue, dA_dst, CL_FALSE,  // non-blocking
        buffer_origin, host_orig, region,
        ldda*sizeof(magmaDoubleComplex), 0,
        ldha*sizeof(magmaDoubleComplex), 0,
        hA_src, 0, NULL, event );
    clFlush(queue);
    return err;
}
Beispiel #10
0
// --------------------
extern "C" void
magma_csetmatrix_async(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex const* hA_src,                   magma_int_t ldha,
    magmaFloatComplex_ptr    dB_dst, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue, magma_event_t *event )
{
    if (m <= 0 || n <= 0)
        return;

    size_t buffer_origin[3] = { dB_offset*sizeof(magmaFloatComplex), 0, 0 };
    size_t host_orig[3]     = { 0, 0, 0 };
    size_t region[3]        = { m*sizeof(magmaFloatComplex), n, 1 };
    cl_int err = clEnqueueWriteBufferRect(
        queue, dB_dst, CL_FALSE,  // non-blocking
        buffer_origin, host_orig, region,
        lddb*sizeof(magmaFloatComplex), 0,
        ldha*sizeof(magmaFloatComplex), 0,
        hA_src, 0, NULL, event );
    clFlush(queue);
    check_error( err );
}
Beispiel #11
0
/*!
    Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine
    to the region of this buffer defined by \a rect and \a bufferBytesPerLine.
    Returns true if the write was successful; false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa writeRectAsync(), readRect()
*/
bool QCLBuffer::writeRect
(const QRect &rect, const void *data,
 size_t bufferBytesPerLine, size_t hostBytesPerLine)
{
#ifdef QT_OPENCL_1_1
    size_t bufferOrigin[3] = {rect.x(), rect.y(), 0};
    size_t bufferRegion[3] = {rect.width(), rect.height(), 1};
    static size_t const hostOrigin[3] = {0, 0, 0};
    cl_int error = clEnqueueWriteBufferRect
                   (context()->activeQueue(), memoryId(),
                    CL_TRUE, bufferOrigin, hostOrigin, bufferRegion,
                    bufferBytesPerLine, 0, hostBytesPerLine, 0,
                    data, 0, 0, 0);
    context()->reportError("QCLBuffer::writeRect:", error);
    return error == CL_SUCCESS;
#else
    context()->reportError("QCLBuffer::writeRect:", CL_INVALID_OPERATION);
    Q_UNUSED(rect);
    Q_UNUSED(data);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(hostBytesPerLine);
    return false;
#endif
}
Beispiel #12
0
    /// Enqueues a command to write a rectangular region from host memory
    /// to \p buffer.
    ///
    /// \see_opencl_ref{clEnqueueWriteBufferRect}
    ///
    /// \opencl_version_warning{1,1}
    void enqueue_write_buffer_rect(const buffer &buffer,
                                   const size_t buffer_origin[3],
                                   const size_t host_origin[3],
                                   const size_t region[3],
                                   size_t buffer_row_pitch,
                                   size_t buffer_slice_pitch,
                                   size_t host_row_pitch,
                                   size_t host_slice_pitch,
                                   void *host_ptr,
                                   const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);
        BOOST_ASSERT(buffer.get_context() == this->get_context());
        BOOST_ASSERT(host_ptr != 0);

        cl_int ret = clEnqueueWriteBufferRect(
            m_queue,
            buffer.get(),
            CL_TRUE,
            buffer_origin,
            host_origin,
            region,
            buffer_row_pitch,
            buffer_slice_pitch,
            host_row_pitch,
            host_slice_pitch,
            host_ptr,
            events.size(),
            events.get_event_ptr(),
            0
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }
    }
void b2CLCommonData::CopyJoints(b2World *m_pWorld, bool warmStarting, bool isJointChanged, bool isJointUpdated) 
{
	if (!isJointChanged && !isJointUpdated)
		return;

	if (isJointChanged)
	{
		if (warmStarting)
			StoreJointImpulses();
		lastNumTotalJoints = numTotalJoints;

		DeleteJoints();

		numTotalJoints = m_pWorld->m_jointCount;
	}	

	if (numTotalJoints == 0)
		return;

	b2Joint* pJoint = NULL;
	if (isJointChanged)
	{
		pJoint = m_pWorld->m_jointList; 
		for (int32 i = 0; i < numTotalJoints; ++i)
		{
			assert(pJoint->GetType() < numJointTypes);
			++numJoints[pJoint->GetType()];
			pJoint = pJoint->GetNext(); 
		}

		jointListData = new b2clJoint [numTotalJoints];
		memset(jointListData, 0, sizeof(b2clJoint) * numTotalJoints);
		jointListBuffer = b2CLDevice::instance().allocateArray(sizeof(b2clJoint) * numTotalJoints);
	}

	int32 jointIndex[numJointTypes];
	int32 currentIndex = 0;
	for (int32 i = 0; i < numJointTypes; ++i)
	{
		jointIndex[i] = currentIndex;
		jointColorOffsets[i][0] = currentIndex;

		currentIndex += numJoints[i];
	}

	pJoint = m_pWorld->m_jointList;
	for (int32 i = 0; i < m_pWorld->m_jointCount; ++i)
	{
		int32 jointType = pJoint->GetType();
		assert(copyJointFunc[jointType]);
		(*copyJointFunc[jointType])(pJoint, &jointListData[jointIndex[jointType]++]);
		pJoint = pJoint->GetNext(); 
	}
	ComputeJointColors(m_pWorld);

	if (isJointChanged)
	{
		b2CLDevice::instance().copyArrayToDevice(this->jointListBuffer, jointListData, 0, sizeof(b2clJoint) * numTotalJoints, true);

		if (warmStarting)
			ReadLastJointImpulses();
	}
	else
	{
		// copy data without indices and impulses
		const size_t origin[] = {sizeof(b2clJointImpulseNode), 0, 0};
		const size_t region[] = {sizeof(b2clJoint) - sizeof(b2clJointImpulseNode), numTotalJoints, 1};
		cl_int ciErrNum = clEnqueueWriteBufferRect(b2CLDevice::instance().GetCommandQueue(), jointListBuffer, CL_TRUE, origin, origin, region,
			sizeof(b2clJoint), 0, sizeof(b2clJoint), 0, jointListData, 0, NULL, NULL);
		b2clCheckError(ciErrNum, CL_SUCCESS);
	}
}
Beispiel #14
0
	void roundtrip_func_rect()
	{
	timer.Start(timer_id);
		cl_int err;
		//rect
		size_t a_buffer_origin[3] = {0,0,0}; 
		size_t a_host_origin[3] = {0,0,0};
		size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
		size_t a_buffer_row_pitch=0*sizeof(T);//lda
		size_t a_buffer_slice_pitch=0;
		size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
		size_t a_host_slice_pitch=0;

		size_t b_buffer_origin[3] = {0,0,0}; 
		size_t b_host_origin[3] = {0,0,0};
		size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
		size_t b_buffer_row_pitch=0*sizeof(T);//ldb
		size_t b_buffer_slice_pitch=0;
		size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
		size_t b_host_slice_pitch=0;

		size_t c_buffer_origin[3] = {0,0,0}; 
		size_t c_host_origin[3] = {0,0,0};
		size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
		size_t c_buffer_row_pitch=0*sizeof(T);//ldc
		size_t c_buffer_slice_pitch=0;
		size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
		size_t c_host_slice_pitch=0;

        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                       (buffer_.k_*buffer_.m_ +
                                           buffer_.offA_) * sizeof(T),
                                       NULL, &err);

        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                        (buffer_.k_ * buffer_.n_ +
                                            buffer_.offB_) * sizeof(T),
                                        NULL, &err);

        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                        (buffer_.m_ * buffer_.n_ +
                                            buffer_.offC_) * sizeof(T),
                                        NULL, &err);
        /*
		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                   buffer_.offA_ * sizeof(T),
                                   buffer_.lda_ * buffer_.a_num_vectors_ *
                                       sizeof(T),
                                   buffer_.a_, 0, NULL, NULL);
		
        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
                                   buffer_.offB_ * sizeof(T),
                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
                                       sizeof(T),
                                   buffer_.b_, 0, NULL, NULL);

        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
                                   buffer_.offC_ * sizeof(T),
                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
                                   sizeof(T),
                                   buffer_.c_, 0, NULL, NULL);*/
        err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
										a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
        err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
										b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
        err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);

		if(buffer_.trans_a_==clblasNoTrans)
		{
			buffer_.lda_=buffer_.m_;
		}
		else
		{
			buffer_.lda_=buffer_.k_;
		}
		if(buffer_.trans_b_==clblasNoTrans)
		{
			buffer_.ldb_=buffer_.k_;
		}
		else
		{
			buffer_.ldb_=buffer_.n_;
		}
		buffer_.ldc_=buffer_.m_;
		xGemm_Function(false);
		/*
		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
                                       sizeof(T),
								  buffer_.c_, 0, NULL, &event_);
		*/
		err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
		clWaitForEvents(1, &event_);
	timer.Stop(timer_id);
	}	
Beispiel #15
0
WEAK int halide_copy_to_dev(void *user_context, buffer_t* buf) {
    int err = halide_dev_malloc(user_context, buf);
    if (err) {
        return err;
    }

    DEBUG_PRINTF(user_context, "CL: halide_copy_to_dev (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueWriteBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->host_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_host_to_dev_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                size_t offset[3] = { off, 0, 0 };
                size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] };

                DEBUG_PRINTF( user_context, "    clEnqueueWriteBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w,
                              (void *)c.src, c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueWriteBufferRect(ctx.cmd_queue, (cl_mem)c.dst, CL_FALSE,
                                                      offset, offset, region,
                                                      c.stride_bytes[0], c.stride_bytes[1],
                                                      c.stride_bytes[0], c.stride_bytes[1],
                                                      (void *)c.src,
                                                      0, NULL, NULL);

                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueWriteBufferRect failed (%d)\n", err);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;

                        DEBUG_PRINTF( user_context, "    clEnqueueWriteBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w,
                                      (long long)size, src, (void *)dst );
                        cl_int err = clEnqueueWriteBuffer(ctx.cmd_queue, (cl_mem)c.dst,
                                                          CL_FALSE, off, size, src, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueWriteBuffer failed (%d)\n", err);
                            return err;
                        }
                    }
                }
#endif
            }
        }
        // The writes above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't write
        // to the buffer while the above writes are still running.
        clFinish(ctx.cmd_queue);

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->host_dirty = false;
    return 0;
}
Beispiel #16
0
int main(int argc, char** argv) {

   // Set up the data on the host	
   clock_t start, start0;
   start0 = clock();
   start = clock();
   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";



   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth, 
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Pad the number of columns 
#ifdef NON_OPTIMIZED
   int deviceWidth = imageWidth;
#else  // READ_ALIGNED || READ4
   int deviceWidth = roundUp(imageWidth, WGX);
#endif
   int deviceHeight = imageHeight;
   // Size of the input and output images on the device
   int deviceDataSize = imageHeight*deviceWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);
   int i, j;
   for(i = 0; i < imageHeight; i++) {
       for(j = 0; j < imageWidth; j++) {
           outputImage[i*imageWidth+j] = 0;
       }
   }

   // 45 degree motion blur
   float filter[49] = 
      {0,      0,      0,      0,      0, 0.0145,      0,
       0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
       0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
       0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
       0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
  0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
       0, 0.0145,      0,      0,      0,      0,      0};
 
   int filterWidth = 7;
   int paddingPixels = (int)(filterWidth/2) * 2; 
   stoptime(start, "set up input, output.");
   start = clock();
   // Set up the OpenCL environment

   // Discovery platform
   cl_platform_id platform;
   clGetPlatformIDs(1, &platform, NULL);

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device,
      NULL);

    size_t time_res;
    clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
            sizeof(time_res), &time_res, NULL);
    printf("Device profiling timer resolution: %zu ns.\n", time_res);

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 
       (cl_context_properties)(platform), 0};
   cl_context context; 
   context = clCreateContext(props, 1, &device, NULL, NULL, 
      NULL);

   // Create command queue
   cl_ulong time_start, time_end, exec_time;
   cl_event timing_event;
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

   // Create memory buffers
   cl_mem d_inputImage;
   cl_mem d_outputImage;
   cl_mem d_filter;
   d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       deviceDataSize, NULL, NULL);
   d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
       deviceDataSize, NULL, NULL);
   d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       49*sizeof(float),NULL, NULL);
   
   // Write input data to the device
#ifdef NON_OPTIMIZED
   clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize,
       inputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   size_t buffer_origin[3] = {0,0,0};
   size_t host_origin[3] = {0,0,0};
   size_t region[3] = {deviceWidth*sizeof(float), 
      imageHeight, 1};
   clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0,
      inputImage, 0, NULL, NULL);
#endif
	
   // Write the filter to the device
   clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 
      49*sizeof(float), filter, 0, NULL, NULL);
	
   // Read in the program from file
   char* source = readSource("convolution.cl");

   // Create the program
   cl_program program;
	
   // Create and compile the program
   program = clCreateProgramWithSource(context, 1, 
       (const char**)&source, NULL, NULL);
   cl_int build_status;
   build_status = clBuildProgram(program, 1, &device, NULL, NULL,
      NULL);
      
   // Create the kernel
   cl_kernel kernel;
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   // Only the host-side code differs for the aligned reads
   kernel = clCreateKernel(program, "convolution", NULL);
#else // READ4
   kernel = clCreateKernel(program, "convolution_read4", NULL);
#endif
	
   // Selected work group size is 16x16
   int wgWidth = WGX;
   int wgHeight = WGY;

   // When computing the total number of work items, the 
   // padding work items do not need to be considered
   int totalWorkItemsX = roundUp(imageWidth-paddingPixels, 
      wgWidth);
   int totalWorkItemsY = roundUp(imageHeight-paddingPixels, 
      wgHeight);

   // Size of a work group
   size_t localSize[2] = {wgWidth, wgHeight};
   // Size of the NDRange
   size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY};

   // The amount of local data that is cached is the size of the
   // work groups plus the padding pixels
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   int localWidth = localSize[0] + paddingPixels;
#else // READ4
   // Round the local width up to 4 for the read4 kernel
   int localWidth = roundUp(localSize[0]+paddingPixels, 4);
#endif
   int localHeight = localSize[1] + paddingPixels;

   // Compute the size of local memory (needed for dynamic 
   // allocation)
   size_t localMemSize = (localWidth * localHeight * 
      sizeof(float));

   // Set the kernel arguments
   clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
   clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter);
   clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight);
   clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); 
   clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
   clSetKernelArg(kernel, 6, localMemSize, NULL);
   clSetKernelArg(kernel, 7, sizeof(int), &localHeight); 
   clSetKernelArg(kernel, 8, sizeof(int), &localWidth);

   stoptime(start, "set up kernel");
   start = clock();
   // Execute the kernel
   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, 
      localSize, 0, NULL, &timing_event);

   // Wait for kernel to complete
   clFinish(queue);
   stoptime(start, "run kernel");
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
   exec_time = time_end-time_start;
   printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000);

   // Read back the output image
#ifdef NON_OPTIMIZED
   clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, 
      deviceDataSize, outputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   // Begin reading output from (3,3) on the device 
   // (for 7x7 filter with radius 3)
   buffer_origin[0] = 3*sizeof(float);
   buffer_origin[1] = 3;
   buffer_origin[2] = 0;

   // Read data into (3,3) on the host
   host_origin[0] = 3*sizeof(float);
   host_origin[1] = 3;
   host_origin[2] = 0;
	
   // Region is image size minus padding pixels
   region[0] = (imageWidth-paddingPixels)*sizeof(float);
   region[1] = (imageHeight-paddingPixels);
   region[2] = 1;
	
	// Perform the read
   clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, 
      outputImage, 0, NULL, NULL);
#endif
  
   // Homegrown function to write the image to file
   storeImage(outputImage, outputFile, imageHeight, 
      imageWidth, inputFile);
   
   // Free OpenCL objects
   clReleaseMemObject(d_inputImage);
   clReleaseMemObject(d_outputImage);
   clReleaseMemObject(d_filter);
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(queue);
   clReleaseContext(context);

   return 0;
}
Beispiel #17
0
/** Thread that receives data from client.
 * @param data struct dataTransfer casted variable.
 * @return NULL
 */
void *asyncDataRecvRect_thread(void *data)
{
    unsigned int i,j,k,n;
    size_t buffsize = BUFF_SIZE*sizeof(char);
    struct dataSend* _data = (struct dataSend*)data;
    size_t host_origin[3] = {0, 0, 0};
    // Receive the data
    int *fd = &(_data->fd);
    Send(fd, &buffsize, sizeof(size_t), 0);
    // Compute the number of packages needed
    n = _data->host_row_pitch / buffsize;
    // Receive the rows
    size_t origin = 0;
    for(j=0;j<_data->region[1];j++){
        for(k=0;k<_data->region[2];k++){
            // Receive package by pieces
            for(i=0;i<n;i++){
                Recv(fd, _data->ptr + i*buffsize + origin, buffsize, MSG_WAITALL);
            }
            if(_data->host_row_pitch % buffsize){
                // Remains some data to arrive
                Recv(fd, _data->ptr + n*buffsize + origin, _data->host_row_pitch % buffsize, MSG_WAITALL);
            }
            // Compute the new origin
            origin += _data->host_row_pitch;
        }
    }
    // We may wait manually for the events provided because
    // OpenCL can only waits their events, but ocland event
    // can be relevant. We will not check for errors,
    // assuming than events can be wrong, but is to late to
    // try to report a fail.
    if(_data->num_events_in_wait_list){
        oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list);
    }
    // Call to OpenCL
    clEnqueueWriteBufferRect(_data->command_queue,_data->mem,CL_FALSE,
                             _data->buffer_origin,host_origin,_data->region,
                             _data->buffer_row_pitch,_data->buffer_slice_pitch,
                             _data->host_row_pitch,_data->host_slice_pitch,
                             _data->ptr,0,NULL,&(_data->event->event));
    // Wait until data is copied here. We will not test
    // for errors, user can do it later
    clWaitForEvents(1,&(_data->event->event));
    free(_data->buffer_origin); _data->buffer_origin = NULL;
    free(_data->region); _data->region = NULL;
    free(_data->ptr); _data->ptr = NULL;
    if(_data->event){
        _data->event->status = CL_COMPLETE;
    }
    if(_data->want_event != CL_TRUE){
        free(_data->event); _data->event = NULL;
    }
    if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL;
    // shutdown(fd, 2);
    // shutdown(_data->fd, 2); // Destroy the server to free the port
    close(fd);
    close(_data->fd);
    free(_data); _data=NULL;
    pthread_exit(NULL);
    return NULL;
}
END_TEST

START_TEST (test_read_write_rect)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_mem buf, buf_part;

    // Grid xyz = (5 x 7 x 2)
    unsigned char grid[70] = {
        0, 0, 0, 0, 0,
        0, 1, 1, 1, 0,
        1, 2, 2, 2, 1,
        1, 2, 3, 2, 1,
        1, 2, 2, 2, 1,
        0, 1, 1, 1, 0,
        0, 0, 0, 0, 0,

        0, 0, 1, 0, 0,
        0, 0, 2, 0, 0,
        0, 1, 3, 1, 0,
        0, 2, 3, 2, 0,
        1, 3, 3, 3, 1,
        2, 3, 3, 3, 2,
        3, 3, 3, 3, 3
    };

    // Middle of the "image" : 3 x 3 x 2 centered at (3, 3)
    unsigned char part[18] = {
        2, 2, 2,
        2, 3, 2,
        2, 2, 2,

        1, 3, 1,
        2, 3, 2,
        3, 3, 3
    };

    unsigned char buffer[70], buffer_part[18];
    size_t host_origin[3] = {0, 0, 0};
    size_t buf_origin[3] = {0, 0, 0};
    size_t region[3] = {5, 7, 2};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                         sizeof(buffer), buffer, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
    );

    buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(buffer_part), buffer_part, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a buffer for the part that will be read"
    );

    // Write grid into buffer
    result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin,
                                      region, 0, 0, 0, 0, grid, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking write buffer rect event with pitches guessed"
    );
    fail_if(
        std::memcmp(buffer, grid, sizeof(buffer)) != 0,
        "buffer doesn't contain the data"
    );

    // Read it back into a temporary region
    buf_origin[0] = 1;
    buf_origin[1] = 2;
    buf_origin[2] = 0;
    // host_origin remains (0, 0, 0)
    region[0] = 3;
    region[1] = 3;
    region[2] = 2;

    result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin,
                                     region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a blocking write buffer rect event with host pitches guessed"
    );
    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read"
    );

    // Clear the temporary region and re-read into it using buf_part
    std::memset(buffer_part, 0, sizeof(buffer_part));
    cl_event event;

    result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin,
                                     host_origin, region, 5, 5*7, 0, 0, 0, 0, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a copy buffer rect event"
    );

    result = clWaitForEvents(1, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to wait for the event"
    );

    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read using a buffer"
    );

    clReleaseEvent(event);
    clReleaseMemObject(buf_part);
    clReleaseMemObject(buf);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
Beispiel #19
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n)
    gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;

  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sum = 0;
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l)
        {
          sum += gray[(i+k)*xsize + (j+l)] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray[i*xsize + j] = sum;
    }
  }

  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n)
    r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max);
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = {
    ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],
    ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],
  };

  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Edit: Copy the blurred image to input buffer
#ifdef NON_OPTIMIZED
    CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0,
          deviceDataSize, 0, NULL, NULL));
#else
    clEnqueueCopyBufferRect(queue, buf_congray, buf_gray,
                            buffer_origin, host_origin, region,
                            deviceWidth*sizeof(float), 0,
                            xsize*sizeof(float), 0,
                            0, NULL, NULL);
#endif
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        xsize * ysize * sizeof(float), congray_cl,
        0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,
      buffer_origin, host_origin, region,
      deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
      congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");

  // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect
  for(int n = 0; n < xsize*ysize; ++n) {
    int color = (int)(congray_cl[n] * rgb_max);

    if (color < 0) {
      color = 0;
    } else if (color > 255) {
      color = 255;
    }

    r[n] = g[n] = b[n] = color;
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}