Ejemplo n.º 1
0
/*!
    Copies the contents of \a rect within this buffer to \a dest,
    starting at \a destPoint.  The source and destination line pitch
    values are given by \a bufferBytesPerLine and \a destBytesPerLine
    respectively.

    The request will not start until all of the events in \a after
    have been signaled as finished.  The request is executed on
    the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa copyToRect()
*/
QCLEvent QCLBuffer::copyToRectAsync
(const QRect &rect, const QCLBuffer &dest, const QPoint &destPoint,
 size_t bufferBytesPerLine, size_t destBytesPerLine,
 const QCLEventList &after)
{
#ifdef QT_OPENCL_1_1
    const size_t src_origin[3] = {rect.x(), rect.y(), 0};
    const size_t dst_origin[3] = {destPoint.x(), destPoint.y(), 0};
    const size_t region[3] = {rect.width(), rect.height(), 1};
    cl_event event;
    cl_int error = clEnqueueCopyBufferRect
                   (context()->activeQueue(), memoryId(), dest.memoryId(),
                    src_origin, dst_origin, region,
                    bufferBytesPerLine, 0, destBytesPerLine, 0,
                    after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::copyToRectAsync:", error);
    if (error == CL_SUCCESS)
        return QCLEvent(event);
    else
        return QCLEvent();
#else
    context()->reportError("QCLBuffer::copyToRectAsync:", CL_INVALID_OPERATION);
    Q_UNUSED(rect);
    Q_UNUSED(dest);
    Q_UNUSED(destPoint);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(destBytesPerLine);
    Q_UNUSED(after);
    return false;
#endif
}
Ejemplo n.º 2
0
/*!
    Copies the 3D rectangle defined by \a origin and \a size within
    this buffer to \a destOrigin within \a dest.  The source and destination
    pitch values are given by \a bufferBytesPerLine, \a bufferBytesPerSlice,
    \a destBytesPerLine, and \a destBytesPerSlice.

    The request will not start until all of the events in \a after
    have been signaled as finished.  The request is executed on
    the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa copyToRectAsync()
*/
QCLEvent QCLBuffer::copyToRectAsync
(const size_t origin[3], const size_t size[3],
 const QCLBuffer &dest, const size_t destOrigin[3],
 size_t bufferBytesPerLine, size_t bufferBytesPerSlice,
 size_t destBytesPerLine, size_t destBytesPerSlice,
 const QCLEventList &after)
{
#ifdef QT_OPENCL_1_1
    cl_event event;
    cl_int error = clEnqueueCopyBufferRect
                   (context()->activeQueue(), memoryId(), dest.memoryId(),
                    origin, destOrigin, size,
                    bufferBytesPerLine, bufferBytesPerSlice,
                    destBytesPerLine, destBytesPerSlice,
                    after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::copyToRectAsync(3D):", error);
    if (error == CL_SUCCESS)
        return QCLEvent(event);
    else
        return QCLEvent();
#else
    context()->reportError("QCLBuffer::copyToRectAsync(3D):", CL_INVALID_OPERATION);
    Q_UNUSED(origin);
    Q_UNUSED(size);
    Q_UNUSED(dest);
    Q_UNUSED(destOrigin);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(bufferBytesPerSlice);
    Q_UNUSED(destBytesPerLine);
    Q_UNUSED(destBytesPerSlice);
    Q_UNUSED(after);
    return false;
#endif
}
Ejemplo n.º 3
0
/*!
    Copies the contents of \a rect within this buffer to \a dest,
    starting at \a destPoint.  The source and destination line pitch
    values are given by \a bufferBytesPerLine and \a destBytesPerLine
    respectively.  Returns true if the copy was successful; false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa copyToRectAsync()
*/
bool QCLBuffer::copyToRect
(const QRect &rect, const QCLBuffer &dest,
 const QPoint &destPoint, size_t bufferBytesPerLine,
 size_t destBytesPerLine)
{
#ifdef QT_OPENCL_1_1
    const size_t src_origin[3] = {rect.x(), rect.y(), 0};
    const size_t dst_origin[3] = {destPoint.x(), destPoint.y(), 0};
    const size_t region[3] = {rect.width(), rect.height(), 1};
    cl_event event;
    cl_int error = clEnqueueCopyBufferRect
                   (context()->activeQueue(), memoryId(), dest.memoryId(),
                    src_origin, dst_origin, region,
                    bufferBytesPerLine, 0, destBytesPerLine, 0, 0, 0, &event);
    context()->reportError("QCLBuffer::copyToRect:", error);
    if (error == CL_SUCCESS) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
        return true;
    } else {
        return false;
    }
#else
    context()->reportError("QCLBuffer::copyToRect:", CL_INVALID_OPERATION);
    Q_UNUSED(rect);
    Q_UNUSED(dest);
    Q_UNUSED(destPoint);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(destBytesPerLine);
    return false;
#endif
}
Ejemplo n.º 4
0
/*!
    Copies the 3D rectangle defined by \a origin and \a size within
    this buffer to \a destOrigin within \a dest.  The source and destination
    pitch values are given by \a bufferBytesPerLine, \a bufferBytesPerSlice,
    \a destBytesPerLine, and \a destBytesPerSlice.  Returns true if
    the copy was successful; false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa copyToRectAsync()
*/
bool QCLBuffer::copyToRect
(const size_t origin[3], const size_t size[3],
 const QCLBuffer &dest, const size_t destOrigin[3],
 size_t bufferBytesPerLine, size_t bufferBytesPerSlice,
 size_t destBytesPerLine, size_t destBytesPerSlice)
{
#ifdef QT_OPENCL_1_1
    cl_event event;
    cl_int error = clEnqueueCopyBufferRect
                   (context()->activeQueue(), memoryId(), dest.memoryId(),
                    origin, destOrigin, size,
                    bufferBytesPerLine, bufferBytesPerSlice,
                    destBytesPerLine, destBytesPerSlice, 0, 0, &event);
    context()->reportError("QCLBuffer::copyToRect(3D):", error);
    if (error == CL_SUCCESS) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
        return true;
    } else {
        return false;
    }
#else
    context()->reportError("QCLBuffer::copyToRect(3D):", CL_INVALID_OPERATION);
    Q_UNUSED(origin);
    Q_UNUSED(size);
    Q_UNUSED(dest);
    Q_UNUSED(destOrigin);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(bufferBytesPerSlice);
    Q_UNUSED(destBytesPerLine);
    Q_UNUSED(destBytesPerSlice);
    return false;
#endif
}
Ejemplo n.º 5
0
template <typename T> tDeviceRC DWTKernel<T>::copyLLBandToSrc(int LLSizeX, int LLSizeY){
	  // copy forward or reverse transformed LL band from output back into the input
	size_t bufferOffset[] = { 0, 0, 0};
	cl_int err = CL_SUCCESS;

	// The region size must be given in bytes
	size_t region[] = {LLSizeX * sizeof(T), LLSizeY, 1 };
			
	err = clEnqueueCopyBufferRect ( queue, 	//copy command will be queued
				    dstMem,		
					srcMem,		
					bufferOffset,	//offset associated with src_buffer
					bufferOffset,     //offset associated with src_buffer
					region,		//(width, height, depth) in bytes of the 2D or 3D rectangle being copied
					region[0],   //length of each row in bytes
					0, //length of each 2D slice in bytes 
					region[0] ,   //length of each row in bytes
					0, //length of each 2D slice in bytes
					0,
					NULL,
					NULL);
	if (CL_SUCCESS != err)
	{
		LogError("Error: clEnqueueCopyBufferRect (srcMem) returned %s.\n", TranslateOpenCLError(err));
	}
	return err;

}
Ejemplo n.º 6
0
// --------------------
    magma_err_t
magma_zcopymatrix(
        magma_int_t m, magma_int_t n,
        magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
        magmaDoubleComplex_ptr    dB_dst, size_t dB_offset, magma_int_t lddb,
        magma_queue_t queue )
{
    size_t src_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t dst_orig[3]   = { dB_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueCopyBufferRect(
            queue, dA_src, dB_dst,
            src_origin, dst_orig, region,
            ldda*sizeof(magmaDoubleComplex), 0,
            lddb*sizeof(magmaDoubleComplex), 0,
            0, NULL, NULL );
    return err;
}
Ejemplo n.º 7
0
// --------------------
extern "C" void
magma_zcopymatrix(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
    magmaDoubleComplex_ptr       dB_dst, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue )
{
    if (m <= 0 || n <= 0)
        return;

    size_t src_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t dst_orig[3]   = { dB_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t region[3]     = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueCopyBufferRect(
        queue, dA_src, dB_dst,
        src_origin, dst_orig, region,
        ldda*sizeof(magmaDoubleComplex), 0,
        lddb*sizeof(magmaDoubleComplex), 0,
        0, NULL, g_event );
    check_error( err );
}
Ejemplo n.º 8
0
    /// Enqueues a command to copy a rectangular region from
    /// \p src_buffer to \p dst_buffer.
    ///
    /// \see_opencl_ref{clEnqueueCopyBufferRect}
    ///
    /// \opencl_version_warning{1,1}
    event enqueue_copy_buffer_rect(const buffer &src_buffer,
                                   const buffer &dst_buffer,
                                   const size_t src_origin[3],
                                   const size_t dst_origin[3],
                                   const size_t region[3],
                                   size_t buffer_row_pitch,
                                   size_t buffer_slice_pitch,
                                   size_t host_row_pitch,
                                   size_t host_slice_pitch,
                                   const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);
        BOOST_ASSERT(src_buffer.get_context() == this->get_context());
        BOOST_ASSERT(dst_buffer.get_context() == this->get_context());

        event event_;

        cl_int ret = clEnqueueCopyBufferRect(
            m_queue,
            src_buffer.get(),
            dst_buffer.get(),
            src_origin,
            dst_origin,
            region,
            buffer_row_pitch,
            buffer_slice_pitch,
            host_row_pitch,
            host_slice_pitch,
            events.size(),
            events.get_event_ptr(),
            &event_.get()
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
Ejemplo n.º 9
0
// --------------------
extern "C" void
magma_ccopymatrix_async(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_ptr       dB_dst, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue, magma_event_t *event )
{
    if (m <= 0 || n <= 0)
        return;

    // TODO how to make non-blocking?
    size_t src_origin[3] = { dA_offset*sizeof(magmaFloatComplex), 0, 0 };
    size_t dst_orig[3]   = { dB_offset*sizeof(magmaFloatComplex), 0, 0 };
    size_t region[3]     = { m*sizeof(magmaFloatComplex), n, 1 };
    cl_int err = clEnqueueCopyBufferRect(
        queue, dA_src, dB_dst,
        src_origin, dst_orig, region,
        ldda*sizeof(magmaFloatComplex), 0,
        lddb*sizeof(magmaFloatComplex), 0,
        0, NULL, event );
    check_error( err );
}
Ejemplo n.º 10
0
       cl_mem bindTexture(const oclMat &mat)
        {
            cl_mem texture;
            cl_image_format format;
            int err;
            int depth    = mat.depth();
            int channels = mat.channels();

            switch(depth)
            {
            case CV_8U:
                format.image_channel_data_type = CL_UNSIGNED_INT8;
                break;
            case CV_32S:
                format.image_channel_data_type = CL_UNSIGNED_INT32;
                break;
            case CV_32F:
                format.image_channel_data_type = CL_FLOAT;
                break;
            default:
                throw std::exception();
                break;
            }
            switch(channels)
            {
            case 1:
                format.image_channel_order     = CL_R;
                break;
            case 3:
                format.image_channel_order     = CL_RGB;
                break;
            case 4:
                format.image_channel_order     = CL_RGBA;
                break;
            default:
                throw std::exception();
                break;
            }
#if CL_VERSION_1_2
            cl_image_desc desc;
            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
            desc.image_width      = mat.cols;
            desc.image_height     = mat.rows;
            desc.image_depth      = 0;
            desc.image_array_size = 1;
            desc.image_row_pitch  = 0;
            desc.image_slice_pitch = 0;
            desc.buffer           = NULL;
            desc.num_mip_levels   = 0;
            desc.num_samples      = 0;
            texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
            texture = clCreateImage2D(
                mat.clCxt->impl->clContext,
                CL_MEM_READ_WRITE,
                &format,
                mat.cols,
                mat.rows,
                0,
                NULL,
                &err);
#endif
            size_t origin[] = { 0, 0, 0 };
            size_t region[] = { mat.cols, mat.rows, 1 };

            cl_mem devData;
            if (mat.cols * mat.elemSize() != mat.step)
            {
                devData = clCreateBuffer(mat.clCxt->impl->clContext, CL_MEM_READ_ONLY, mat.cols * mat.rows
                    * mat.elemSize(), NULL, NULL);
                const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
                clEnqueueCopyBufferRect(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, devData, origin, origin,
                    regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
            }
            else
            {
                devData = (cl_mem)mat.data;
            }

            clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, devData, texture, 0, origin, region, 0, NULL, 0);
            if ((mat.cols * mat.elemSize() != mat.step))
            {
                clFinish(mat.clCxt->impl->clCmdQueue);
                clReleaseMemObject(devData);
            }

            openCLSafeCall(err);
            return texture;
        }
Ejemplo n.º 11
0
        cl_mem bindTexture(const oclMat &mat)
        {
            cl_mem texture;
            cl_image_format format;
            int err;
            int depth    = mat.depth();
            int channels = mat.oclchannels();

            switch(depth)
            {
            case CV_8U:
                format.image_channel_data_type = CL_UNSIGNED_INT8;
                break;
            case CV_32S:
                format.image_channel_data_type = CL_UNSIGNED_INT32;
                break;
            case CV_32F:
                format.image_channel_data_type = CL_FLOAT;
                break;
            default:
                CV_Error(-1, "Image forma is not supported");
                break;
            }
            switch(channels)
            {
            case 1:
                format.image_channel_order     = CL_R;
                break;
            case 3:
                format.image_channel_order     = CL_RGB;
                break;
            case 4:
                format.image_channel_order     = CL_RGBA;
                break;
            default:
                CV_Error(-1, "Image format is not supported");
                break;
            }
#ifdef CL_VERSION_1_2
            //this enables backwards portability to
            //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
            if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2))
            {
                cl_image_desc desc;
                desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
                desc.image_width      = mat.cols;
                desc.image_height     = mat.rows;
                desc.image_depth      = 0;
                desc.image_array_size = 1;
                desc.image_row_pitch  = 0;
                desc.image_slice_pitch = 0;
                desc.buffer           = NULL;
                desc.num_mip_levels   = 0;
                desc.num_samples      = 0;
                texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
            }
            else
#endif
            {
                texture = clCreateImage2D(
                    *(cl_context*)mat.clCxt->getOpenCLContextPtr(),
                    CL_MEM_READ_WRITE,
                    &format,
                    mat.cols,
                    mat.rows,
                    0,
                    NULL,
                    &err);
            }
            size_t origin[] = { 0, 0, 0 };
            size_t region[] = { mat.cols, mat.rows, 1 };

            cl_mem devData;
            if (mat.cols * mat.elemSize() != mat.step)
            {
                devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_ONLY, mat.cols * mat.rows
                    * mat.elemSize(), NULL, NULL);
                const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
                clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), (cl_mem)mat.data, devData, origin, origin,
                    regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
            }
            else
            {
                devData = (cl_mem)mat.data;
            }

            clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), devData, texture, 0, origin, region, 0, NULL, 0);
            if ((mat.cols * mat.elemSize() != mat.step))
            {
                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
                clReleaseMemObject(devData);
            }

            openCLSafeCall(err);
            return texture;
        }
Ejemplo n.º 12
0
void b2CLCommonData::StoreJointImpulses()
{
	assert(numTotalJoints >= 0);

	if (numTotalJoints == 0)
		return;

	int sortCount = numTotalJoints;

#ifndef USE_CPU_SORT
	if (numTotalJoints < BITONIC_SORT_INTEL_MINNUM)
		sortCount = BITONIC_SORT_INTEL_MINNUM;
	else
	{
		// compute the least power-of-2 which >= m_contactCount
		int exp;
		frexp((float)numTotalJoints, &exp);
		sortCount = 1 << (exp-1);
		if (sortCount < numTotalJoints)
			sortCount <<= 1;
	}
#endif

	// Re-allocate impulse buffer if needed.
	if (sortCount > lastSortCount)
	{
		if (jointImpulsesBuffer)
			b2CLDevice::instance().freeArray(jointImpulsesBuffer);
		jointImpulsesBuffer = b2CLDevice::instance().allocateArray(sizeof(b2clJointImpulseNode) * sortCount);

		if (jointImpulseKeysBuffer)
			b2CLDevice::instance().freeArray(jointImpulseKeysBuffer);
		jointImpulseKeysBuffer = b2CLDevice::instance().allocateArray(sizeof(int) * sortCount);

		if (jointImpulseGlobalIndicesBuffer)
			b2CLDevice::instance().freeArray(jointImpulseGlobalIndicesBuffer);
		jointImpulseGlobalIndicesBuffer = b2CLDevice::instance().allocateArray(sizeof(int) * sortCount);
	}

	{
		const size_t origin[] = {0, 0, 0};
		const size_t region[] = {sizeof(b2clJointImpulseNode), numTotalJoints, 1};
		cl_int ciErrNum = clEnqueueCopyBufferRect(b2CLDevice::instance().GetCommandQueue(), jointListBuffer, jointImpulsesBuffer, origin, origin, region,
			sizeof(b2clJoint), 0, 0, 0, 0, NULL, NULL);
		b2clCheckError(ciErrNum, CL_SUCCESS);
	}

	{
		const size_t origin[] = {0, 0, 0};
		const size_t region[] = {sizeof(int), numTotalJoints, 1};
		cl_int ciErrNum = clEnqueueCopyBufferRect(b2CLDevice::instance().GetCommandQueue(), jointListBuffer, jointImpulseKeysBuffer, origin, origin, region,
			sizeof(b2clJoint), 0, 0, 0, 0, NULL, NULL);
		b2clCheckError(ciErrNum, CL_SUCCESS);

#ifndef USE_CPU_SORT
		// Because the buffer size is a power of 2, set unused part of the impulse key buffer to 0.
		unsigned int *zeroBuffer = new unsigned int[sortCount - numTotalJoints];
		memset(zeroBuffer, 0, sizeof(unsigned int) * (sortCount - numTotalJoints));
		b2CLDevice::instance().copyArrayToDevice(jointImpulseKeysBuffer, zeroBuffer, sizeof(int) * numTotalJoints, sizeof(unsigned int) * (sortCount - numTotalJoints), true);
		delete [] zeroBuffer;
#endif
	}

	{
		int* ascendingNumbers = new int[numTotalJoints];
		for (int i = 0; i < numTotalJoints; ++i)
		{
			ascendingNumbers[i] = i;
		}
		b2CLDevice::instance().copyArrayToDevice(jointImpulseGlobalIndicesBuffer, ascendingNumbers, 0, sizeof(int) * numTotalJoints, true);
		delete[] ascendingNumbers;
	}

	// sort by descending order of indices
#if defined(USE_CPU_SORT)
	b2CLSort::instance().stlSort(jointImpulseKeysBuffer, jointImpulseGlobalIndicesBuffer, numTotalJoints, 0, 1);
#else
	b2CLSort::instance().bitonicSort_Intel(jointImpulseKeysBuffer, jointImpulseGlobalIndicesBuffer, sortCount, 0);
#endif

	lastSortCount = sortCount;
}
Ejemplo n.º 13
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n)
    gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;

  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sum = 0;
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l)
        {
          sum += gray[(i+k)*xsize + (j+l)] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray[i*xsize + j] = sum;
    }
  }

  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n)
    r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max);
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = {
    ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],
    ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],
  };

  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Edit: Copy the blurred image to input buffer
#ifdef NON_OPTIMIZED
    CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0,
          deviceDataSize, 0, NULL, NULL));
#else
    clEnqueueCopyBufferRect(queue, buf_congray, buf_gray,
                            buffer_origin, host_origin, region,
                            deviceWidth*sizeof(float), 0,
                            xsize*sizeof(float), 0,
                            0, NULL, NULL);
#endif
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        xsize * ysize * sizeof(float), congray_cl,
        0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,
      buffer_origin, host_origin, region,
      deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
      congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");

  // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect
  for(int n = 0; n < xsize*ysize; ++n) {
    int color = (int)(congray_cl[n] * rgb_max);

    if (color < 0) {
      color = 0;
    } else if (color > 255) {
      color = 255;
    }

    r[n] = g[n] = b[n] = color;
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}
Ejemplo n.º 14
0
END_TEST

START_TEST (test_read_write_rect)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_mem buf, buf_part;

    // Grid xyz = (5 x 7 x 2)
    unsigned char grid[70] = {
        0, 0, 0, 0, 0,
        0, 1, 1, 1, 0,
        1, 2, 2, 2, 1,
        1, 2, 3, 2, 1,
        1, 2, 2, 2, 1,
        0, 1, 1, 1, 0,
        0, 0, 0, 0, 0,

        0, 0, 1, 0, 0,
        0, 0, 2, 0, 0,
        0, 1, 3, 1, 0,
        0, 2, 3, 2, 0,
        1, 3, 3, 3, 1,
        2, 3, 3, 3, 2,
        3, 3, 3, 3, 3
    };

    // Middle of the "image" : 3 x 3 x 2 centered at (3, 3)
    unsigned char part[18] = {
        2, 2, 2,
        2, 3, 2,
        2, 2, 2,

        1, 3, 1,
        2, 3, 2,
        3, 3, 3
    };

    unsigned char buffer[70], buffer_part[18];
    size_t host_origin[3] = {0, 0, 0};
    size_t buf_origin[3] = {0, 0, 0};
    size_t region[3] = {5, 7, 2};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                         sizeof(buffer), buffer, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
    );

    buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(buffer_part), buffer_part, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a buffer for the part that will be read"
    );

    // Write grid into buffer
    result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin,
                                      region, 0, 0, 0, 0, grid, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking write buffer rect event with pitches guessed"
    );
    fail_if(
        std::memcmp(buffer, grid, sizeof(buffer)) != 0,
        "buffer doesn't contain the data"
    );

    // Read it back into a temporary region
    buf_origin[0] = 1;
    buf_origin[1] = 2;
    buf_origin[2] = 0;
    // host_origin remains (0, 0, 0)
    region[0] = 3;
    region[1] = 3;
    region[2] = 2;

    result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin,
                                     region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a blocking write buffer rect event with host pitches guessed"
    );
    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read"
    );

    // Clear the temporary region and re-read into it using buf_part
    std::memset(buffer_part, 0, sizeof(buffer_part));
    cl_event event;

    result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin,
                                     host_origin, region, 5, 5*7, 0, 0, 0, 0, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a copy buffer rect event"
    );

    result = clWaitForEvents(1, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to wait for the event"
    );

    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read using a buffer"
    );

    clReleaseEvent(event);
    clReleaseMemObject(buf_part);
    clReleaseMemObject(buf);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}