/*! Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine to the region of this buffer defined by \a rect and \a bufferBytesPerLine. Returns true if the write was successful; false otherwise. This function will queue the request and return immediately. Returns an event object that can be used to wait for the request to finish. The request will not start until all of the events in \a after have been signaled as finished. The request is executed on the active command queue for context(). This function is only supported in OpenCL 1.1 and higher. \sa writeRect(), readRectAsync() */ QCLEvent QCLBuffer::writeRectAsync (const QRect &rect, const void *data, size_t bufferBytesPerLine, size_t hostBytesPerLine, const QCLEventList &after) { #ifdef QT_OPENCL_1_1 size_t bufferOrigin[3] = {rect.x(), rect.y(), 0}; size_t bufferRegion[3] = {rect.width(), rect.height(), 1}; static size_t const hostOrigin[3] = {0, 0, 0}; cl_event event; cl_int error = clEnqueueWriteBufferRect (context()->activeQueue(), memoryId(), CL_FALSE, bufferOrigin, hostOrigin, bufferRegion, bufferBytesPerLine, 0, hostBytesPerLine, 0, data, after.size(), after.eventData(), &event); context()->reportError("QCLBuffer::writeRectAsync:", error); if (error != CL_SUCCESS) return QCLEvent(); else return QCLEvent(event); #else context()->reportError("QCLBuffer::writeRectAsync:", CL_INVALID_OPERATION); Q_UNUSED(rect); Q_UNUSED(data); Q_UNUSED(bufferBytesPerLine); Q_UNUSED(hostBytesPerLine); Q_UNUSED(after); return QCLEvent(); #endif }
/*! Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine, and a slice pitch of \a hostBytesPerSlice, to the 3D region defined by \a origin, \a size, \a bufferBytesPerLine, and \a bufferBytesPerSlice in this buffer. This function will queue the request and return immediately. Returns an event object that can be used to wait for the request to finish. The request will not start until all of the events in \a after have been signaled as finished. The request is executed on the active command queue for context(). This function is only supported in OpenCL 1.1 and higher. \sa writeRect(), readRectAsync() */ QCLEvent QCLBuffer::writeRectAsync (const size_t origin[3], const size_t size[3], const void *data, size_t bufferBytesPerLine, size_t bufferBytesPerSlice, size_t hostBytesPerLine, size_t hostBytesPerSlice, const QCLEventList &after) { #ifdef QT_OPENCL_1_1 static size_t const hostOrigin[3] = {0, 0, 0}; cl_event event; cl_int error = clEnqueueWriteBufferRect (context()->activeQueue(), memoryId(), CL_FALSE, origin, hostOrigin, size, bufferBytesPerLine, bufferBytesPerSlice, hostBytesPerLine, hostBytesPerSlice, data, after.size(), after.eventData(), &event); context()->reportError("QCLBuffer::writeRectAsync(3D):", error); if (error != CL_SUCCESS) return QCLEvent(); else return QCLEvent(event); #else context()->reportError("QCLBuffer::writeRectAsync(3D):", CL_INVALID_OPERATION); Q_UNUSED(origin); Q_UNUSED(size); Q_UNUSED(data); Q_UNUSED(bufferBytesPerLine); Q_UNUSED(bufferBytesPerSlice); Q_UNUSED(hostBytesPerLine); Q_UNUSED(hostBytesPerSlice); Q_UNUSED(after); return QCLEvent(); #endif }
/*! Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine, and a slice pitch of \a hostBytesPerSlice, to the 3D region defined by \a origin, \a size, \a bufferBytesPerLine, and \a bufferBytesPerSlice in this buffer. Returns true if the write was successful; false otherwise. This function will block until the request finishes. The request is executed on the active command queue for context(). This function is only supported in OpenCL 1.1 and higher. \sa writeRectAsync(), readRect() */ bool QCLBuffer::writeRect (const size_t origin[3], const size_t size[3], const void *data, size_t bufferBytesPerLine, size_t bufferBytesPerSlice, size_t hostBytesPerLine, size_t hostBytesPerSlice) { #ifdef QT_OPENCL_1_1 static size_t const hostOrigin[3] = {0, 0, 0}; cl_int error = clEnqueueWriteBufferRect (context()->activeQueue(), memoryId(), CL_TRUE, origin, hostOrigin, size, bufferBytesPerLine, bufferBytesPerSlice, hostBytesPerLine, hostBytesPerSlice, data, 0, 0, 0); context()->reportError("QCLBuffer::writeRect(3D):", error); return error == CL_SUCCESS; #else context()->reportError("QCLBuffer::writeRect(3D):", CL_INVALID_OPERATION); Q_UNUSED(origin); Q_UNUSED(size); Q_UNUSED(data); Q_UNUSED(bufferBytesPerLine); Q_UNUSED(bufferBytesPerSlice); Q_UNUSED(hostBytesPerLine); Q_UNUSED(hostBytesPerSlice); return false; #endif }
/* H2D */ void copyPitched(cl_mem dst, const void* src, cl_command_queue queue, size_t w, size_t h, size_t pitch) const { const size_t offset[3] = {0, 0, 0}; size_t region[3] = {w, h, 1}; CHECK_CL(clEnqueueWriteBufferRect( queue, dst, Blocking<T_Async>::value, offset, // buffer origin offset, // host origin region, pitch, // buffer row pitch 0, // buffer slice pitch 0, // host row pitch 0, // host slice pitch src, 0, // num_events_in_wait_list nullptr, // event_wait_list nullptr )); // event }
void DeviceMatrixCL3D_copyToDevice(DeviceMatrixCL3D& self, const float* data) { if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) { const int mem_size = self.dim_y *self.dim_t * self.pitch_y; TheContext * tc = new TheContext(); size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = { self.dim_x * sizeof(float), self.dim_y, self.dim_t}; int err = clEnqueueWriteBufferRect( tc->getMyContext()->cqCommandQueue, self.dataMatrix, CL_TRUE, buffer_origin, host_origin, region, self.pitch_y, 0, sizeof(float) * self.dim_x, 0, data, 0, NULL, NULL); if (err != 0){ std::cout << "Error in copyToDevice (CODE: " << err << ")" << std::endl; } } }
magma_err_t magma_zsetmatrix_trace( magma_int_t m, magma_int_t n, magmaDoubleComplex const* hA_src, size_t hA_offset, magma_int_t ldha, magmaDoubleComplex_ptr dA_dst, size_t dA_offset, magma_int_t ldda, magma_queue_t queue, magma_event_t *event ) { size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t host_orig[3] = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t region[3] = { m*sizeof(magmaDoubleComplex), n, 1 }; cl_int err = clEnqueueWriteBufferRect( queue, dA_dst, CL_TRUE, // blocking buffer_origin, host_orig, region, ldda*sizeof(magmaDoubleComplex), 0, ldha*sizeof(magmaDoubleComplex), 0, hA_src, 0, NULL, event ); return err; }
//// ======================================== //// copying sub-matrices (contiguous columns) //// OpenCL takes queue even for blocking transfers, oddly. magma_err_t magma_ssetmatrix( magma_int_t m, magma_int_t n, float const* hA_src, size_t hA_offset, magma_int_t ldha, magmaFloat_ptr dA_dst, size_t dA_offset, magma_int_t ldda, magma_queue_t queue ) { size_t buffer_origin[3] = { dA_offset*sizeof(float), 0, 0 }; size_t host_orig[3] = { 0, 0, 0 }; size_t region[3] = { m*sizeof(float), n, 1 }; cl_int err = clEnqueueWriteBufferRect( queue, dA_dst, CL_TRUE, // blocking buffer_origin, host_orig, region, ldda*sizeof(float), 0, ldha*sizeof(float), 0, hA_src, 0, NULL, NULL ); return err; }
// ======================================== // copying sub-matrices (contiguous columns) // OpenCL takes queue even for blocking transfers, oddly. extern "C" void magma_zsetmatrix( magma_int_t m, magma_int_t n, magmaDoubleComplex const* hA_src, magma_int_t ldha, magmaDoubleComplex_ptr dB_dst, size_t dB_offset, magma_int_t lddb, magma_queue_t queue ) { if (m <= 0 || n <= 0) return; size_t buffer_origin[3] = { dB_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t host_orig[3] = { 0, 0, 0 }; size_t region[3] = { m*sizeof(magmaDoubleComplex), n, 1 }; cl_int err = clEnqueueWriteBufferRect( queue, dB_dst, CL_TRUE, // blocking buffer_origin, host_orig, region, lddb*sizeof(magmaDoubleComplex), 0, ldha*sizeof(magmaDoubleComplex), 0, hA_src, 0, NULL, g_event ); check_error( err ); }
// -------------------- magma_err_t magma_zsetmatrix_async( magma_int_t m, magma_int_t n, magmaDoubleComplex const* hA_src, size_t hA_offset, magma_int_t ldha, magmaDoubleComplex_ptr dA_dst, size_t dA_offset, magma_int_t ldda, magma_queue_t queue, magma_event_t *event ) { if ( m<=0 || n <= 0 ) return MAGMA_SUCCESS; size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t host_orig[3] = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t region[3] = { m*sizeof(magmaDoubleComplex), n, 1 }; cl_int err = clEnqueueWriteBufferRect( queue, dA_dst, CL_FALSE, // non-blocking buffer_origin, host_orig, region, ldda*sizeof(magmaDoubleComplex), 0, ldha*sizeof(magmaDoubleComplex), 0, hA_src, 0, NULL, event ); clFlush(queue); return err; }
// -------------------- extern "C" void magma_csetmatrix_async( magma_int_t m, magma_int_t n, magmaFloatComplex const* hA_src, magma_int_t ldha, magmaFloatComplex_ptr dB_dst, size_t dB_offset, magma_int_t lddb, magma_queue_t queue, magma_event_t *event ) { if (m <= 0 || n <= 0) return; size_t buffer_origin[3] = { dB_offset*sizeof(magmaFloatComplex), 0, 0 }; size_t host_orig[3] = { 0, 0, 0 }; size_t region[3] = { m*sizeof(magmaFloatComplex), n, 1 }; cl_int err = clEnqueueWriteBufferRect( queue, dB_dst, CL_FALSE, // non-blocking buffer_origin, host_orig, region, lddb*sizeof(magmaFloatComplex), 0, ldha*sizeof(magmaFloatComplex), 0, hA_src, 0, NULL, event ); clFlush(queue); check_error( err ); }
/*! Writes the bytes at \a data, with a line pitch of \a hostBytesPerLine to the region of this buffer defined by \a rect and \a bufferBytesPerLine. Returns true if the write was successful; false otherwise. This function will block until the request finishes. The request is executed on the active command queue for context(). This function is only supported in OpenCL 1.1 and higher. \sa writeRectAsync(), readRect() */ bool QCLBuffer::writeRect (const QRect &rect, const void *data, size_t bufferBytesPerLine, size_t hostBytesPerLine) { #ifdef QT_OPENCL_1_1 size_t bufferOrigin[3] = {rect.x(), rect.y(), 0}; size_t bufferRegion[3] = {rect.width(), rect.height(), 1}; static size_t const hostOrigin[3] = {0, 0, 0}; cl_int error = clEnqueueWriteBufferRect (context()->activeQueue(), memoryId(), CL_TRUE, bufferOrigin, hostOrigin, bufferRegion, bufferBytesPerLine, 0, hostBytesPerLine, 0, data, 0, 0, 0); context()->reportError("QCLBuffer::writeRect:", error); return error == CL_SUCCESS; #else context()->reportError("QCLBuffer::writeRect:", CL_INVALID_OPERATION); Q_UNUSED(rect); Q_UNUSED(data); Q_UNUSED(bufferBytesPerLine); Q_UNUSED(hostBytesPerLine); return false; #endif }
/// Enqueues a command to write a rectangular region from host memory /// to \p buffer. /// /// \see_opencl_ref{clEnqueueWriteBufferRect} /// /// \opencl_version_warning{1,1} void enqueue_write_buffer_rect(const buffer &buffer, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *host_ptr, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); BOOST_ASSERT(buffer.get_context() == this->get_context()); BOOST_ASSERT(host_ptr != 0); cl_int ret = clEnqueueWriteBufferRect( m_queue, buffer.get(), CL_TRUE, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, host_ptr, events.size(), events.get_event_ptr(), 0 ); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } }
void b2CLCommonData::CopyJoints(b2World *m_pWorld, bool warmStarting, bool isJointChanged, bool isJointUpdated) { if (!isJointChanged && !isJointUpdated) return; if (isJointChanged) { if (warmStarting) StoreJointImpulses(); lastNumTotalJoints = numTotalJoints; DeleteJoints(); numTotalJoints = m_pWorld->m_jointCount; } if (numTotalJoints == 0) return; b2Joint* pJoint = NULL; if (isJointChanged) { pJoint = m_pWorld->m_jointList; for (int32 i = 0; i < numTotalJoints; ++i) { assert(pJoint->GetType() < numJointTypes); ++numJoints[pJoint->GetType()]; pJoint = pJoint->GetNext(); } jointListData = new b2clJoint [numTotalJoints]; memset(jointListData, 0, sizeof(b2clJoint) * numTotalJoints); jointListBuffer = b2CLDevice::instance().allocateArray(sizeof(b2clJoint) * numTotalJoints); } int32 jointIndex[numJointTypes]; int32 currentIndex = 0; for (int32 i = 0; i < numJointTypes; ++i) { jointIndex[i] = currentIndex; jointColorOffsets[i][0] = currentIndex; currentIndex += numJoints[i]; } pJoint = m_pWorld->m_jointList; for (int32 i = 0; i < m_pWorld->m_jointCount; ++i) { int32 jointType = pJoint->GetType(); assert(copyJointFunc[jointType]); (*copyJointFunc[jointType])(pJoint, &jointListData[jointIndex[jointType]++]); pJoint = pJoint->GetNext(); } ComputeJointColors(m_pWorld); if (isJointChanged) { b2CLDevice::instance().copyArrayToDevice(this->jointListBuffer, jointListData, 0, sizeof(b2clJoint) * numTotalJoints, true); if (warmStarting) ReadLastJointImpulses(); } else { // copy data without indices and impulses const size_t origin[] = {sizeof(b2clJointImpulseNode), 0, 0}; const size_t region[] = {sizeof(b2clJoint) - sizeof(b2clJointImpulseNode), numTotalJoints, 1}; cl_int ciErrNum = clEnqueueWriteBufferRect(b2CLDevice::instance().GetCommandQueue(), jointListBuffer, CL_TRUE, origin, origin, region, sizeof(b2clJoint), 0, sizeof(b2clJoint), 0, jointListData, 0, NULL, NULL); b2clCheckError(ciErrNum, CL_SUCCESS); } }
void roundtrip_func_rect() { timer.Start(timer_id); cl_int err; //rect size_t a_buffer_origin[3] = {0,0,0}; size_t a_host_origin[3] = {0,0,0}; size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; size_t a_buffer_row_pitch=0*sizeof(T);//lda size_t a_buffer_slice_pitch=0; size_t a_host_row_pitch=buffer_.lda_*sizeof(T); size_t a_host_slice_pitch=0; size_t b_buffer_origin[3] = {0,0,0}; size_t b_host_origin[3] = {0,0,0}; size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; size_t b_buffer_row_pitch=0*sizeof(T);//ldb size_t b_buffer_slice_pitch=0; size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); size_t b_host_slice_pitch=0; size_t c_buffer_origin[3] = {0,0,0}; size_t c_host_origin[3] = {0,0,0}; size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; size_t c_buffer_row_pitch=0*sizeof(T);//ldc size_t c_buffer_slice_pitch=0; size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); size_t c_host_slice_pitch=0; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_*buffer_.m_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_ * buffer_.n_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.m_ * buffer_.n_ + buffer_.offC_) * sizeof(T), NULL, &err); /* err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL);*/ err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch, a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch, b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); if(buffer_.trans_a_==clblasNoTrans) { buffer_.lda_=buffer_.m_; } else { buffer_.lda_=buffer_.k_; } if(buffer_.trans_b_==clblasNoTrans) { buffer_.ldb_=buffer_.k_; } else { buffer_.ldb_=buffer_.n_; } buffer_.ldc_=buffer_.m_; xGemm_Function(false); /* err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, &event_); */ err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); }
WEAK int halide_copy_to_dev(void *user_context, buffer_t* buf) { int err = halide_dev_malloc(user_context, buf); if (err) { return err; } DEBUG_PRINTF(user_context, "CL: halide_copy_to_dev (user_context: %p, buf: %p)\n", user_context, buf ); // Acquire the context so we can use the command queue. This also avoids multiple // redundant calls to clEnqueueWriteBuffer when multiple threads are trying to copy // the same buffer. ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } if (buf->host_dirty) { #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, buf->host && buf->dev); halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); _dev_copy c = _make_host_to_dev_copy(buf); for (int w = 0; w < c.extent[3]; w++) { for (int z = 0; z < c.extent[2]; z++) { #ifdef ENABLE_OPENCL_11 // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we // can deal with the 2 innermost strides with OpenCL. uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3]; size_t offset[3] = { off, 0, 0 }; size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] }; DEBUG_PRINTF( user_context, " clEnqueueWriteBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n", z, w, (void *)c.src, c.dst, (int)off, (int)region[0], (int)region[1], (int)region[2], (int)c.stride_bytes[0], (int)c.stride_bytes[1]); cl_int err = clEnqueueWriteBufferRect(ctx.cmd_queue, (cl_mem)c.dst, CL_FALSE, offset, offset, region, c.stride_bytes[0], c.stride_bytes[1], c.stride_bytes[0], c.stride_bytes[1], (void *)c.src, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueWriteBufferRect failed (%d)\n", err); return err; } #else for (int y = 0; y < c.extent[1]; y++) { for (int x = 0; x < c.extent[0]; x++) { uint64_t off = (x * c.stride_bytes[0] + y * c.stride_bytes[1] + z * c.stride_bytes[2] + w * c.stride_bytes[3]); void *src = (void *)(c.src + off); void *dst = (void *)(c.dst + off); uint64_t size = c.chunk_size; DEBUG_PRINTF( user_context, " clEnqueueWriteBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n", x, y, z, w, (long long)size, src, (void *)dst ); cl_int err = clEnqueueWriteBuffer(ctx.cmd_queue, (cl_mem)c.dst, CL_FALSE, off, size, src, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueWriteBuffer failed (%d)\n", err); return err; } } } #endif } } // The writes above are all non-blocking, so empty the command // queue before we proceed so that other host code won't write // to the buffer while the above writes are still running. clFinish(ctx.cmd_queue); #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif } buf->host_dirty = false; return 0; }
int main(int argc, char** argv) { // Set up the data on the host clock_t start, start0; start0 = clock(); start = clock(); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Pad the number of columns #ifdef NON_OPTIMIZED int deviceWidth = imageWidth; #else // READ_ALIGNED || READ4 int deviceWidth = roundUp(imageWidth, WGX); #endif int deviceHeight = imageHeight; // Size of the input and output images on the device int deviceDataSize = imageHeight*deviceWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); int i, j; for(i = 0; i < imageHeight; i++) { for(j = 0; j < imageWidth; j++) { outputImage[i*imageWidth+j] = 0; } } // 45 degree motion blur float filter[49] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; int filterWidth = 7; int paddingPixels = (int)(filterWidth/2) * 2; stoptime(start, "set up input, output."); start = clock(); // Set up the OpenCL environment // Discovery platform cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); size_t time_res; clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL); printf("Device profiling timer resolution: %zu ns.\n", time_res); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, NULL); // Create command queue cl_ulong time_start, time_end, exec_time; cl_event timing_event; cl_command_queue queue; queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); // Create memory buffers cl_mem d_inputImage; cl_mem d_outputImage; cl_mem d_filter; d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL); d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL); d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 49*sizeof(float),NULL, NULL); // Write input data to the device #ifdef NON_OPTIMIZED clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), imageHeight, 1}; clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, inputImage, 0, NULL, NULL); #endif // Write the filter to the device clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 49*sizeof(float), filter, 0, NULL, NULL); // Read in the program from file char* source = readSource("convolution.cl"); // Create the program cl_program program; // Create and compile the program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); cl_int build_status; build_status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Create the kernel cl_kernel kernel; #if defined NON_OPTIMIZED || defined READ_ALIGNED // Only the host-side code differs for the aligned reads kernel = clCreateKernel(program, "convolution", NULL); #else // READ4 kernel = clCreateKernel(program, "convolution_read4", NULL); #endif // Selected work group size is 16x16 int wgWidth = WGX; int wgHeight = WGY; // When computing the total number of work items, the // padding work items do not need to be considered int totalWorkItemsX = roundUp(imageWidth-paddingPixels, wgWidth); int totalWorkItemsY = roundUp(imageHeight-paddingPixels, wgHeight); // Size of a work group size_t localSize[2] = {wgWidth, wgHeight}; // Size of the NDRange size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY}; // The amount of local data that is cached is the size of the // work groups plus the padding pixels #if defined NON_OPTIMIZED || defined READ_ALIGNED int localWidth = localSize[0] + paddingPixels; #else // READ4 // Round the local width up to 4 for the read4 kernel int localWidth = roundUp(localSize[0]+paddingPixels, 4); #endif int localHeight = localSize[1] + paddingPixels; // Compute the size of local memory (needed for dynamic // allocation) size_t localMemSize = (localWidth * localHeight * sizeof(float)); // Set the kernel arguments clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter); clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight); clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); clSetKernelArg(kernel, 6, localMemSize, NULL); clSetKernelArg(kernel, 7, sizeof(int), &localHeight); clSetKernelArg(kernel, 8, sizeof(int), &localWidth); stoptime(start, "set up kernel"); start = clock(); // Execute the kernel clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, &timing_event); // Wait for kernel to complete clFinish(queue); stoptime(start, "run kernel"); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); exec_time = time_end-time_start; printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000); // Read back the output image #ifdef NON_OPTIMIZED clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 // Begin reading output from (3,3) on the device // (for 7x7 filter with radius 3) buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; // Read data into (3,3) on the host host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; // Region is image size minus padding pixels region[0] = (imageWidth-paddingPixels)*sizeof(float); region[1] = (imageHeight-paddingPixels); region[2] = 1; // Perform the read clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, outputImage, 0, NULL, NULL); #endif // Homegrown function to write the image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); // Free OpenCL objects clReleaseMemObject(d_inputImage); clReleaseMemObject(d_outputImage); clReleaseMemObject(d_filter); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
/** Thread that receives data from client. * @param data struct dataTransfer casted variable. * @return NULL */ void *asyncDataRecvRect_thread(void *data) { unsigned int i,j,k,n; size_t buffsize = BUFF_SIZE*sizeof(char); struct dataSend* _data = (struct dataSend*)data; size_t host_origin[3] = {0, 0, 0}; // Receive the data int *fd = &(_data->fd); Send(fd, &buffsize, sizeof(size_t), 0); // Compute the number of packages needed n = _data->host_row_pitch / buffsize; // Receive the rows size_t origin = 0; for(j=0;j<_data->region[1];j++){ for(k=0;k<_data->region[2];k++){ // Receive package by pieces for(i=0;i<n;i++){ Recv(fd, _data->ptr + i*buffsize + origin, buffsize, MSG_WAITALL); } if(_data->host_row_pitch % buffsize){ // Remains some data to arrive Recv(fd, _data->ptr + n*buffsize + origin, _data->host_row_pitch % buffsize, MSG_WAITALL); } // Compute the new origin origin += _data->host_row_pitch; } } // We may wait manually for the events provided because // OpenCL can only waits their events, but ocland event // can be relevant. We will not check for errors, // assuming than events can be wrong, but is to late to // try to report a fail. if(_data->num_events_in_wait_list){ oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list); } // Call to OpenCL clEnqueueWriteBufferRect(_data->command_queue,_data->mem,CL_FALSE, _data->buffer_origin,host_origin,_data->region, _data->buffer_row_pitch,_data->buffer_slice_pitch, _data->host_row_pitch,_data->host_slice_pitch, _data->ptr,0,NULL,&(_data->event->event)); // Wait until data is copied here. We will not test // for errors, user can do it later clWaitForEvents(1,&(_data->event->event)); free(_data->buffer_origin); _data->buffer_origin = NULL; free(_data->region); _data->region = NULL; free(_data->ptr); _data->ptr = NULL; if(_data->event){ _data->event->status = CL_COMPLETE; } if(_data->want_event != CL_TRUE){ free(_data->event); _data->event = NULL; } if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL; // shutdown(fd, 2); // shutdown(_data->fd, 2); // Destroy the server to free the port close(fd); close(_data->fd); free(_data); _data=NULL; pthread_exit(NULL); return NULL; }
END_TEST START_TEST (test_read_write_rect) { cl_platform_id platform = 0; cl_device_id device; cl_context ctx; cl_command_queue queue; cl_int result; cl_mem buf, buf_part; // Grid xyz = (5 x 7 x 2) unsigned char grid[70] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1, 2, 3, 2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 3, 1, 0, 0, 2, 3, 2, 0, 1, 3, 3, 3, 1, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3 }; // Middle of the "image" : 3 x 3 x 2 centered at (3, 3) unsigned char part[18] = { 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 3, 1, 2, 3, 2, 3, 3, 3 }; unsigned char buffer[70], buffer_part[18]; size_t host_origin[3] = {0, 0, 0}; size_t buf_origin[3] = {0, 0, 0}; size_t region[3] = {5, 7, 2}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer), buffer, &result); fail_if( result != CL_SUCCESS, "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer" ); buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer_part), buffer_part, &result); fail_if( result != CL_SUCCESS, "cannot create a buffer for the part that will be read" ); // Write grid into buffer result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin, region, 0, 0, 0, 0, grid, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write buffer rect event with pitches guessed" ); fail_if( std::memcmp(buffer, grid, sizeof(buffer)) != 0, "buffer doesn't contain the data" ); // Read it back into a temporary region buf_origin[0] = 1; buf_origin[1] = 2; buf_origin[2] = 0; // host_origin remains (0, 0, 0) region[0] = 3; region[1] = 3; region[2] = 2; result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin, region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "unable to queue a blocking write buffer rect event with host pitches guessed" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read" ); // Clear the temporary region and re-read into it using buf_part std::memset(buffer_part, 0, sizeof(buffer_part)); cl_event event; result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin, host_origin, region, 5, 5*7, 0, 0, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy buffer rect event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for the event" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read using a buffer" ); clReleaseEvent(event); clReleaseMemObject(buf_part); clReleaseMemObject(buf); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
int main(int argc, char *argv[]) { int error, xsize, ysize, rgb_max; int *r, *b, *g; float *gray, *congray, *congray_cl; // identity kernel // float filter[] = { // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,1,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // }; // 45 degree motion blur float filter[] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; // mexican hat kernel // float filter[] = { // 0, 0,-1,-1,-1, 0, 0, // 0,-1,-3,-3,-3,-1, 0, // -1,-3, 0, 7, 0,-3,-1, // -1,-3, 7,24, 7,-3,-1, // -1,-3, 0, 7, 0,-3,-1, // 0,-1,-3,-3,-3,-1, 0, // 0, 0,-1,-1,-1, 0, 0 // }; if(argc != 3) { fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]); abort(); } const char* filename = argv[1]; const int num_loops = atoi(argv[2]); // -------------------------------------------------------------------------- // load image // -------------------------------------------------------------------------- printf("Reading ``%s''\n", filename); ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b); printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize); // -------------------------------------------------------------------------- // allocate CPU buffers // -------------------------------------------------------------------------- posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float)); if(!gray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float)); if(!congray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float)); if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); } // -------------------------------------------------------------------------- // convert image to grayscale // -------------------------------------------------------------------------- for(int n = 0; n < xsize*ysize; ++n) gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max; // -------------------------------------------------------------------------- // execute filter on cpu // -------------------------------------------------------------------------- for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { float sum = 0; for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k) { for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l) { sum += gray[(i+k)*xsize + (j+l)] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; } } congray[i*xsize + j] = sum; } } // -------------------------------------------------------------------------- // output cpu filtered image // -------------------------------------------------------------------------- printf("Writing cpu filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max); error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // get an OpenCL context and queue // -------------------------------------------------------------------------- cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("convolution.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL); free(knl_text); #ifdef NON_OPTIMIZED int deviceWidth = xsize; #else int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX; #endif int deviceHeight = ysize; size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0, deviceDataSize, gray, 0, NULL, NULL)); #else size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), ysize, 1}; clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, gray, 0, NULL, NULL); #endif CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- cl_int rows = ysize; cl_int cols = xsize; cl_int filterWidth = FILTER_WIDTH; cl_int paddingPixels = 2*HALF_FILTER_WIDTH; size_t local_size[] = { WGX, WGY }; size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0], ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1], }; cl_int localWidth = local_size[0] + paddingPixels; cl_int localHeight = local_size[1] + paddingPixels; size_t localMemSize = localWidth * localHeight * sizeof(float); CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter)); CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows)); CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols)); CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth)); CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL)); CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight)); CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth)); // -------------------------------------------------------------------------- // print kernel info // -------------------------------------------------------------------------- print_kernel_info(queue, knl); CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int loop = 0; loop < num_loops; ++loop) { CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL, global_size, local_size, 0, NULL, NULL)); // Edit: Copy the blurred image to input buffer #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0, deviceDataSize, 0, NULL, NULL)); #else clEnqueueCopyBufferRect(queue, buf_congray, buf_gray, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, 0, NULL, NULL); #endif } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops; printf("%f s\n", elapsed); printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed); printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed); printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH) *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed); // -------------------------------------------------------------------------- // transfer back & check // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueReadBuffer( queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0, xsize * ysize * sizeof(float), congray_cl, 0, NULL, NULL)); #else buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; region[0] = (xsize-paddingPixels)*sizeof(float); region[1] = (ysize-paddingPixels); region[2] = 1; clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, congray_cl, 0, NULL, NULL); #endif // -------------------------------------------------------------------------- // output OpenCL filtered image // -------------------------------------------------------------------------- printf("Writing OpenCL filtered image\n"); // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect for(int n = 0; n < xsize*ysize; ++n) { int color = (int)(congray_cl[n] * rgb_max); if (color < 0) { color = 0; } else if (color > 255) { color = 255; } r[n] = g[n] = b[n] = color; } error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_congray)); CALL_CL_SAFE(clReleaseMemObject(buf_gray)); CALL_CL_SAFE(clReleaseMemObject(buf_filter)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(gray); free(congray); free(congray_cl); free(r); free(b); free(g); }