void procOCL_OCV(int texIn, int texOut, int w, int h) { LOGD("Processing OpenCL via OpenCV"); if(!haveOpenCL) { LOGE("OpenCL isn't initialized"); return; } int64_t t = getTimeMs(); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, texIn); std::vector < cl::Memory > images(1, imgIn); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); cv::UMat uIn, uOut, uTmp; cv::ocl::convertFromImage(imgIn(), uIn); LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t)); theQueue.enqueueReleaseGLObjects(&images); t = getTimeMs(); //cv::blur(uIn, uOut, cv::Size(5, 5)); cv::Laplacian(uIn, uTmp, CV_8U); cv:multiply(uTmp, 10, uOut); cv::ocl::finish(); LOGD("OpenCV processing costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut); images.clear(); images.push_back(imgOut); theQueue.enqueueAcquireGLObjects(&images); cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ); cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr(); size_t offset = 0; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { w, h, 1 }; CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS); theQueue.enqueueReleaseGLObjects(&images); cv::ocl::finish(); LOGD("uploading results to texture costs %d ms", getTimeInterval(t)); }
/*!
    Copies the contents of this buffer, starting at \a offset,
    into the rectangle \a rect of \a dest.  Returns true if the
    copy succeeded; false otherwise.

    Blocks until the request finishes.  The request is executed
    on the active command queue for context().

    \sa copyToAsync()
*/
bool QCLBuffer::copyTo(size_t offset, const QCLImage2D &dest, const QRect &rect)
{
    const size_t dst_origin[3] = {static_cast<size_t>(rect.x()),
                                  static_cast<size_t>(rect.y()),
                                  0};
    const size_t region[3] = {static_cast<size_t>(rect.width()),
                              static_cast<size_t>(rect.height()),
                              1};
    cl_event event;
    const cl_int error = clEnqueueCopyBufferToImage
        (context()->activeQueue(), memoryId(), dest.memoryId(),
         offset, dst_origin, region, 0, 0, &event);
    context()->reportError("QCLBuffer::copyTo(QCLImage2D):", error);
    if (error != CL_SUCCESS)
        return false;
    // Wait for the enqueued copy to finish before reporting success.
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
    return true;
}
/*!
    Enqueues a copy of an entire device buffer into a device image.

    \param buffer Valid device buffer (copy source)
    \param image  Device image to fill (copy destination)
    \param height Image height in rows
    \param width  Image width in pixels

    NOTE(review): the original comment documented parameters
    src/dst/mem_size, which do not exist in this signature.
*/
void cl_copyBufferToImage(cl_mem buffer, cl_mem image, int height, int width)
{
    /* Counts copies across calls so each profiling event gets a unique name. */
    static int eventCnt = 0;

    /* eventPtr stays NULL unless profiling events are enabled. */
    cl_event* eventPtr = NULL, event;
    if(eventsEnabled) {
        eventPtr = &event;
    }

    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {width, height, 1};

    cl_int status;
    status = clEnqueueCopyBufferToImage(commandQueue, buffer, image, 0,
                                        origin, region, 0, NULL, eventPtr);
    cl_errChk(status, "Copying buffer to image", true);

    if(eventsEnabled) {
        char* eventStr = catStringWithInt("copyBufferToImage", eventCnt++);
        events->newIOEvent(*eventPtr, eventStr);
    }
}
/*!
    Copies the contents of this buffer, starting at \a offset,
    into the rectangle \a rect of \a dest.  Returns an event
    object that can be used to wait for the request to finish.

    The request will not start until all of the events in
    \a after have been signaled as finished.  It is executed on
    the active command queue for context().

    \sa copyTo()
*/
QCLEvent QCLBuffer::copyToAsync
    (size_t offset, const QCLImage2D &dest, const QRect &rect,
     const QCLEventList &after)
{
    const size_t dst_origin[3] = {static_cast<size_t>(rect.x()),
                                  static_cast<size_t>(rect.y()),
                                  0};
    const size_t region[3] = {static_cast<size_t>(rect.width()),
                              static_cast<size_t>(rect.height()),
                              1};
    cl_event event;
    const cl_int error = clEnqueueCopyBufferToImage
        (context()->activeQueue(), memoryId(), dest.memoryId(),
         offset, dst_origin, region,
         after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::copyToAsync(QCLImage2D):", error);
    // Hand ownership of the event to the caller on success; a default
    // (null) event signals failure.
    return (error == CL_SUCCESS) ? QCLEvent(event) : QCLEvent();
}
/*
 * Creates a single-channel (CL_R) 2D image large enough to hold `size`
 * elements of `dataType`, laid out in rows of at most MaxImageWidth, and
 * copies `memory` into it.
 *
 * NOTE(review): `offset` is accepted but never used — confirm whether it
 * was meant to be passed as the src_offset of the copy.
 * NOTE(review): when `size` is not a multiple of MaxImageWidth, the copy
 * region covers region[0]*region[1] elements, which exceeds `size` — the
 * last row's copy may read past the end of `memory`; verify the buffer is
 * padded by the caller.
 */
int bindTexture(int offset, cl_mem* texture, cl_mem memory, size_t size, cl_channel_type dataType)
{
    size_t origin[3];
    origin[0] = 0;
    origin[1] = 0;
    origin[2] = 0;
    /* Fold the 1D element count into a MaxImageWidth-wide 2D region. */
    size_t region[3];
    region[0] = size <= MaxImageWidth ? size : MaxImageWidth;
    region[1] = (size + MaxImageWidth - 1) / MaxImageWidth;
    region[2] = 1;
    /* Image dimensions must be at least 1 even for an empty buffer. */
    if(region[0] == 0) region[0] = 1;
    if(region[1] == 0) region[1] = 1;

    cl_image_format format;
    format.image_channel_order = CL_R;
    format.image_channel_data_type = dataType;

    int err = CL_SUCCESS;
    //#ifdef CL_VERSION_1_2
    //    cl_image_desc desc = {0};
    //    desc.image_width = region[0];
    //    desc.image_height = region[1];
    //
    //    *texture = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &err);
    //#else
    *texture = clCreateImage2D(context, CL_MEM_READ_ONLY, &format, region[0], region[1], 0, NULL, &err);
    //#endif
    CHKERR(err, "Unable to create texture!");
    /* Skip the copy entirely for an empty buffer; the 1x1 image stays blank. */
    if(size != 0)
        err = clEnqueueCopyBufferToImage(commands, memory, *texture, 0, origin, region, 0, NULL, NULL);
    CHKERR(err, "Unable to buffer texture!");
    return CL_SUCCESS;
}
//-----------------------------------------------------------------------------
//! Run the CL part of the computation
//-----------------------------------------------------------------------------
// Fills the 2D texture, the volume texture, and the six cube-map faces by
// launching one kernel per target, then (when staging buffers are used)
// copying each staging buffer into its GL-shared image.  Errors accumulate
// into ciErrNum and are checked via oclCheckErrorEX after each stage.
void RunKernels()
{
    // Animation time; advanced at the end of every call.
    static float t = 0.0f;

    // ----------------------------------------------------------------
    // populate the 2d texture
    {
        // set global and local work item dimensions
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_2d.width);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_2d.height);

        // set the args values
        // Arg 0 is the output: either the staging buffer or the GL texture image.
#ifdef USE_STAGING_BUFFER
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clMem), (void *) &(g_texture_2d.clMem));
#else
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
#endif
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 1, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 2, sizeof(g_texture_2d.width), &g_texture_2d.width);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 3, sizeof(g_texture_2d.height), &g_texture_2d.height);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 4, sizeof(g_texture_2d.pitch), &g_texture_2d.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 5, sizeof(t), &t);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_tex2d, 2, NULL,
                        szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
        // Copy the staging buffer into the GL-shared 2D image.
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_2d.width, g_texture_2d.height, 1};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                        g_texture_2d.clMem     /* src_buffer */,
                        g_texture_2d.clTexture /* dst_image */,
                        0                      /* src_offset */,
                        dst                    /* dst_origin[3] */,
                        region                 /* region[3] */,
                        0                      /* num_events_in_wait_list */,
                        NULL                   /* event_wait_list */,
                        NULL                   /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }
    // ----------------------------------------------------------------
    // populate the volume texture
    {
        // set global and local work item dimensions
        // NOTE(review): the launch is 2D — presumably the kernel iterates
        // over depth internally; confirm against the kernel source.
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_vol.width);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_vol.height);

        // set the args values
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 0, sizeof(g_texture_vol.clMem), (void *) &(g_texture_vol.clMem));
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 1, sizeof(g_texture_vol.width), &g_texture_vol.width);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 2, sizeof(g_texture_vol.height), &g_texture_vol.height);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 3, sizeof(g_texture_vol.depth), &g_texture_vol.depth);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 4, sizeof(g_texture_vol.pitch), &g_texture_vol.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 5, sizeof(g_texture_vol.pitchslice), &g_texture_vol.pitchslice);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texvolume, 2, NULL,
                        szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // ONLY staging buffer works, for volume texture
        // do the copy here
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                        g_texture_vol.clMem     /* src_buffer */,
                        g_texture_vol.clTexture /* dst_image */,
                        0                       /* src_offset */,
                        dst                     /* dst_origin[3] */,
                        region                  /* region[3] */,
                        0                       /* num_events_in_wait_list */,
                        NULL                    /* event_wait_list */,
                        NULL                    /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }
    // ----------------------------------------------------------------
    // populate the faces of the cube map
    for (int face = 0; face < 6; ++face)
    {
        // set global and local work item dimensions
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_cube.size);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_cube.size);

        // set the args values
#ifdef USE_STAGING_BUFFER
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clMem[face]), (void *) &(g_texture_cube.clMem[face]));
#else
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clTexture[face]), (void *) &(g_texture_cube.clTexture[face]));
#endif
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 1, sizeof(g_texture_cube.size), &g_texture_cube.size);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 2, sizeof(g_texture_cube.pitch), &g_texture_cube.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 3, sizeof(int), &face);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 4, sizeof(t), &t);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texcube, 2, NULL,
                        szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
        // Copy this face's staging buffer into its GL-shared image.
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_cube.size, g_texture_cube.size, 1};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                        g_texture_cube.clMem[face]    /* src_buffer */,
                        g_texture_cube.clTexture[face]/* dst_image */,
                        0                             /* src_offset */,
                        dst                           /* dst_origin[3] */,
                        region                        /* region[3] */,
                        0                             /* num_events_in_wait_list */,
                        NULL                          /* event_wait_list */,
                        NULL                          /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }
    t += 0.1f;
}
/*
 * Round-trip test: fill a host buffer, upload it to a device buffer, copy
 * buffer -> image, verify the image contents, clear the buffer, copy
 * image -> buffer, and verify the round trip, on every image-capable
 * device of every platform.
 */
int main(void)
{
  cl_int err;
  cl_platform_id platforms[MAX_PLATFORMS];
  cl_uint nplatforms;
  cl_device_id devices[MAX_DEVICES];
  cl_uint ndevices;
  cl_uint i, j;
  size_t el, row, col;

  CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms));

  for (i = 0; i < nplatforms; i++)
  {
    CHECK_CL_ERROR(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL,
                                  MAX_DEVICES, devices, &ndevices));

    /* Only test the devices we actually have room for */
    if (ndevices > MAX_DEVICES)
      ndevices = MAX_DEVICES;

    for (j = 0; j < ndevices; j++)
    {
      /* skip devices that do not support images */
      cl_bool has_img;
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE_SUPPORT,
                                     sizeof(has_img), &has_img, NULL));
      if (!has_img)
        continue;

      cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateContext");
      cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err);
      CHECK_OPENCL_ERROR_IN("clCreateCommandQueue");

      cl_ulong alloc;
      size_t max_height;
      size_t max_width;
#define MAXALLOC (1024U*1024U)

      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                                     sizeof(alloc), &alloc, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH,
                                     sizeof(max_width), &max_width, NULL));
      CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT,
                                     sizeof(max_height), &max_height, NULL));

      /* Keep the test allocation small. */
      while (alloc > MAXALLOC)
        alloc /= 2;

      // fit at least one max_width inside the alloc (shrink max_width for this)
      while (max_width*pixel_size > alloc)
        max_width /= 2;

      // round number of elements to next multiple of max_width
      const size_t nels = (alloc/pixel_size/max_width)*max_width;
      const size_t buf_size = nels*pixel_size;

      cl_image_desc img_desc;
      memset(&img_desc, 0, sizeof(img_desc));
      img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
      img_desc.image_width = max_width;
      img_desc.image_height = nels/max_width;
      img_desc.image_depth = 1; /* ignored for 2D images */

      cl_ushort null_pixel[4] = {0, 0, 0, 0};
      cl_ushort *host_buf = (cl_ushort *)malloc(buf_size);
      TEST_ASSERT(host_buf);

      /* FIX: host_buf holds nels pixels = nels*4 cl_ushort channel values.
       * The original bound (el < nels) initialized only the first quarter of
       * the buffer, so the memcmp checks below compared indeterminate bytes. */
      for (el = 0; el < nels*4; el+=4)
      {
        host_buf[el] = el & CHANNEL_MAX;
        host_buf[el+1] = (CHANNEL_MAX - el) & CHANNEL_MAX;
        host_buf[el+2] = (CHANNEL_MAX/((el & 1) + 1) - el) & CHANNEL_MAX;
        host_buf[el+3] = (CHANNEL_MAX - el/((el & 1) + 1)) & CHANNEL_MAX;
      }

      cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateBuffer");
      cl_mem img = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &img_desc, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clCreateImage");

      CHECK_CL_ERROR(clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, buf_size,
                                          host_buf, 0, NULL, NULL));

      const size_t offset = 0;
      const size_t origin[] = {0, 0, 0};
      const size_t region[] = {img_desc.image_width, img_desc.image_height, 1};

      CHECK_CL_ERROR(clEnqueueCopyBufferToImage(queue, buf, img, offset,
                                                origin, region, 0, NULL, NULL));

      /* Map the image and verify it matches the host data pixel by pixel. */
      size_t row_pitch, slice_pitch;
      cl_ushort *img_map = (cl_ushort *)clEnqueueMapImage(queue, img, CL_TRUE,
          CL_MAP_READ, origin, region, &row_pitch, &slice_pitch, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapImage");
      CHECK_CL_ERROR(clFinish(queue));

      for (row = 0; row < img_desc.image_height; ++row)
      {
        for (col = 0; col < img_desc.image_width; ++col)
        {
          cl_ushort *img_pixel = (cl_ushort*)((char*)img_map + row*row_pitch) + col*4;
          cl_ushort *buf_pixel = host_buf + (row*img_desc.image_width + col)*4;
          if (memcmp(img_pixel, buf_pixel, pixel_size) != 0)
            printf("%zu %zu %zu : %x %x %x %x | %x %x %x %x\n",
                   row, col, (size_t)(buf_pixel - host_buf),
                   buf_pixel[0], buf_pixel[1], buf_pixel[2], buf_pixel[3],
                   img_pixel[0], img_pixel[1], img_pixel[2], img_pixel[3]);
          TEST_ASSERT(memcmp(img_pixel, buf_pixel, pixel_size) == 0);
        }
      }
      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, img, img_map, 0, NULL, NULL));

      /* Clear the buffer, and ensure it has been cleared */
      CHECK_CL_ERROR(clEnqueueFillBuffer(queue, buf, null_pixel, sizeof(null_pixel),
                                         0, buf_size, 0, NULL, NULL));

      cl_ushort *buf_map = (cl_ushort *)clEnqueueMapBuffer(queue, buf, CL_TRUE,
          CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err);
      CHECK_OPENCL_ERROR_IN("clEnqueueMapBuffer");
      CHECK_CL_ERROR(clFinish(queue));

      for (el = 0; el < nels; ++el)
      {
#if 0 // debug
        if (buf_map[el] != 0)
        {
          printf("%zu/%zu => %u\n", el, nels, buf_map[el]);
        }
#endif
        TEST_ASSERT(buf_map[el] == 0);
      }
      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL));

      /* Copy data from image to buffer, and check that it's again equal to the
       * original buffer */
      CHECK_CL_ERROR(clEnqueueCopyImageToBuffer(queue, img, buf, origin, region,
                                                offset, 0, NULL, NULL));

      buf_map = (cl_ushort *)clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ,
                                                0, buf_size, 0, NULL, NULL, &err);
      /* FIX: the original never checked this map's error code. */
      CHECK_OPENCL_ERROR_IN("clEnqueueMapBuffer");
      CHECK_CL_ERROR(clFinish(queue));

      TEST_ASSERT(memcmp(buf_map, host_buf, buf_size) == 0);

      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL));
      CHECK_CL_ERROR(clFinish(queue));

      free(host_buf);
      CHECK_CL_ERROR(clReleaseMemObject(img));
      CHECK_CL_ERROR(clReleaseMemObject(buf));
      CHECK_CL_ERROR(clReleaseCommandQueue(queue));
      CHECK_CL_ERROR(clReleaseContext(context));
    }
  }
  return EXIT_SUCCESS;
}
// Creates a CL image matching `mat`'s size/depth/channels and copies the
// matrix data into it.  If `mat` has row padding (step != cols*elemSize),
// the data is first compacted into a temporary tightly-packed buffer via
// clEnqueueCopyBufferRect.  Caller owns (and must release) the returned image.
//
// NOTE(review): `err` from clCreateImage/clCreateImage2D is only checked at
// the very end (openCLSafeCall), after the image has already been used in
// enqueue calls; the clCreateBuffer error argument is NULL, so its failure
// is silently ignored.
cl_mem bindTexture(const oclMat &mat)
{
    cl_mem texture;
    cl_image_format format;
    int err;
    int depth = mat.depth();
    int channels = mat.channels();

    // Map OpenCV element depth to a CL channel data type.
    switch(depth)
    {
    case CV_8U:
        format.image_channel_data_type = CL_UNSIGNED_INT8;
        break;
    case CV_32S:
        format.image_channel_data_type = CL_UNSIGNED_INT32;
        break;
    case CV_32F:
        format.image_channel_data_type = CL_FLOAT;
        break;
    default:
        throw std::exception();
        break;
    }
    // Map channel count to a CL channel order.
    switch(channels)
    {
    case 1:
        format.image_channel_order = CL_R;
        break;
    case 3:
        format.image_channel_order = CL_RGB;
        break;
    case 4:
        format.image_channel_order = CL_RGBA;
        break;
    default:
        throw std::exception();
        break;
    }
    // NOTE(review): `#if CL_VERSION_1_2` selects the 1.2 path at compile time
    // only; compare with the runtime feature check used by the other
    // bindTexture overload in this file.
#if CL_VERSION_1_2
    cl_image_desc desc;
    desc.image_type = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width = mat.cols;
    desc.image_height = mat.rows;
    desc.image_depth = 0;
    desc.image_array_size = 1;
    desc.image_row_pitch = 0;
    desc.image_slice_pitch = 0;
    desc.buffer = NULL;
    desc.num_mip_levels = 0;
    desc.num_samples = 0;
    texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
    texture = clCreateImage2D(
        mat.clCxt->impl->clContext,
        CL_MEM_READ_WRITE,
        &format,
        mat.cols,
        mat.rows,
        0,
        NULL,
        &err);
#endif
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { mat.cols, mat.rows, 1 };

    cl_mem devData;
    if (mat.cols * mat.elemSize() != mat.step)
    {
        // Padded rows: compact into a temporary buffer before the image copy.
        devData = clCreateBuffer(mat.clCxt->impl->clContext, CL_MEM_READ_ONLY,
                                 mat.cols * mat.rows * mat.elemSize(), NULL, NULL);
        const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
        clEnqueueCopyBufferRect(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, devData,
                                origin, origin, regin, mat.step, 0,
                                mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
    }
    else
    {
        devData = (cl_mem)mat.data;
    }

    clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, devData, texture, 0,
                               origin, region, 0, NULL, 0);
    if ((mat.cols * mat.elemSize() != mat.step))
    {
        // Wait for the copy before releasing the temporary buffer.
        clFinish(mat.clCxt->impl->clCmdQueue);
        clReleaseMemObject(devData);
    }

    openCLSafeCall(err);
    return texture;
}
// Sets kernel arguments for the active compute kernel, launches it over a
// Width x Height range, and moves the result either into the GL texture
// (USE_GL_ATTACHMENTS) or into HostImageBuffer via a blocking read.
// Returns CL_SUCCESS on success.
// NOTE(review): failure paths return a mix of -10, EXIT_FAILURE and -5;
// callers should only rely on zero/non-zero.
static int Recompute(void)
{
    void *values[10];
    size_t sizes[10];
    size_t global[2];
    size_t local[2];

    int arg = 0;
    int err = 0;

    float bias[2] = { fabs(Bias[0]), fabs(Bias[1]) };
    float scale[2] = { fabs(Scale), fabs(Scale) };

    // Build the argument pointer/size lists in parallel; kernels other than
    // the first take three extra fractal parameters.
    unsigned int v = 0, s = 0;
    values[v++] = &ComputeResult;
    values[v++] = bias;
    values[v++] = scale;
    if(ActiveKernel > 0)
    {
        values[v++] = &Lacunarity;
        values[v++] = &Increment;
        values[v++] = &Octaves;
    }
    values[v++] = &Amplitude;

    sizes[s++] = sizeof(cl_mem);
    sizes[s++] = sizeof(float) * 2;
    sizes[s++] = sizeof(float) * 2;
    if(ActiveKernel > 0)
    {
        sizes[s++] = sizeof(float);
        sizes[s++] = sizeof(float);
        sizes[s++] = sizeof(float);
    }
    sizes[s++] = sizeof(float);

    err = CL_SUCCESS;
    for (arg = 0; arg < s; arg++)
    {
        err |= clSetKernelArg(ComputeKernels[ActiveKernel], arg, sizes[arg], values[arg]);
    }
    if (err)
        return -10;

    // One work-item per pixel; local width taken from the kernel's queried
    // work-group size.
    global[0] = Width;
    global[1] = Height;

    local[0] = ComputeKernelWorkGroupSizes[ActiveKernel];
    local[1] = 1;

#if DEBUG_INFO
    if(FrameCount <= 1)
        printf("Global[%4d %4d] Local[%4d %4d]\n",
            (int)global[0], (int)global[1],
            (int)local[0], (int)local[1]);
#endif

    err = clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[ActiveKernel], 2, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Failed to enqueue kernel! %d\n", err);
        return EXIT_FAILURE;
    }

#if USE_GL_ATTACHMENTS
    // GL path: acquire the shared image, copy the result buffer into it,
    // then release it back to GL.
    err = clEnqueueAcquireGLObjects(ComputeCommands, 1, &ComputeImage, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        printf("Failed to acquire GL object! %d\n", err);
        return EXIT_FAILURE;
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { Width, Height, 1 };
    err = clEnqueueCopyBufferToImage(ComputeCommands, ComputeResult, ComputeImage,
                                     0, origin, region, 0, NULL, 0);
    if(err != CL_SUCCESS)
    {
        printf("Failed to copy buffer to image! %d\n", err);
        return EXIT_FAILURE;
    }

    err = clEnqueueReleaseGLObjects(ComputeCommands, 1, &ComputeImage, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        printf("Failed to release GL object! %d\n", err);
        return EXIT_FAILURE;
    }
#else
    // Non-GL path: blocking read of the full RGBA result into host memory.
    err = clEnqueueReadBuffer( ComputeCommands, ComputeResult, CL_TRUE, 0,
                               Width * Height * TextureTypeSize * 4,
                               HostImageBuffer, 0, NULL, NULL );
    if (err)
        return -5;
#endif

    return CL_SUCCESS;
}
/* * Update Texture - requires valid OpenGL Context */ void TextureUpdate::updateTexture(const Measurement::ImageMeasurement& image) { #ifdef HAVE_OPENCV // access OCL Manager and initialize if needed Vision::OpenCLManager& oclManager = Vision::OpenCLManager::singleton(); if (!image) { // LOG4CPP_WARN ?? return; } // if OpenCL is enabled and image is on GPU, then use OCL codepath bool image_isOnGPU = oclManager.isInitialized() & image->isOnGPU(); if ( m_bTextureInitialized ) { // check if received image fits into the allocated texture // find out texture format int umatConvertCode = -1; GLenum glFormat = GL_LUMINANCE; GLenum glDatatype = GL_UNSIGNED_BYTE; int numOfChannels = 1; Image::ImageFormatProperties fmtSrc, fmtDst; image->getFormatProperties(fmtSrc); image->getFormatProperties(fmtDst); getImageFormat(fmtSrc, fmtDst, image_isOnGPU, umatConvertCode, glFormat, glDatatype); if (image_isOnGPU) { #ifdef HAVE_OPENCL glBindTexture( GL_TEXTURE_2D, m_texture ); // @todo this probably causes unwanted delay - .. except when executed on gpu ... 
if (umatConvertCode != -1) { cv::cvtColor(image->uMat(), m_convertedImage, umatConvertCode ); } else { m_convertedImage = image->uMat(); } cv::ocl::finish(); glFinish(); cl_command_queue commandQueue = oclManager.getCommandQueue(); cl_int err; clFinish(commandQueue); err = clEnqueueAcquireGLObjects(commandQueue, 1, &(m_clImage), 0, NULL, NULL); if(err != CL_SUCCESS) { LOG4CPP_ERROR( logger, "error at clEnqueueAcquireGLObjects:" << getOpenCLErrorString(err) ); } cl_mem clBuffer = (cl_mem) m_convertedImage.handle(cv::ACCESS_READ); cl_command_queue cv_ocl_queue = (cl_command_queue)cv::ocl::Queue::getDefault().ptr(); size_t offset = 0; size_t dst_origin[3] = {0, 0, 0}; size_t region[3] = {static_cast<size_t>(m_convertedImage.cols), static_cast<size_t>(m_convertedImage.rows), 1}; err = clEnqueueCopyBufferToImage(cv_ocl_queue, clBuffer, m_clImage, offset, dst_origin, region, 0, NULL, NULL); if (err != CL_SUCCESS) { LOG4CPP_ERROR( logger, "error at clEnqueueCopyBufferToImage:" << getOpenCLErrorString(err) ); } err = clEnqueueReleaseGLObjects(commandQueue, 1, &m_clImage, 0, NULL, NULL); if(err != CL_SUCCESS) { LOG4CPP_ERROR( logger, "error at clEnqueueReleaseGLObjects:" << getOpenCLErrorString(err) ); } cv::ocl::finish(); #else // HAVE_OPENCL LOG4CPP_ERROR( logger, "Image isOnGPU but OpenCL is disabled!!"); #endif // HAVE_OPENCL } else { // load image from CPU buffer into texture glBindTexture( GL_TEXTURE_2D, m_texture ); glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, image->width(), image->height(), glFormat, glDatatype, image->Mat().data ); } } #endif // HAVE_OPENCV }
// Creates a CL image matching `mat`'s size/depth/channels and copies the
// matrix data into it.  If `mat` has row padding (step != cols*elemSize),
// the data is first compacted into a temporary tightly-packed buffer via
// clEnqueueCopyBufferRect.  Caller owns (and must release) the returned image.
cl_mem bindTexture(const oclMat &mat)
{
    cl_mem texture;
    cl_image_format format;
    int err;
    int depth = mat.depth();
    int channels = mat.oclchannels();

    // Map OpenCV element depth to a CL channel data type.
    switch(depth)
    {
    case CV_8U:
        format.image_channel_data_type = CL_UNSIGNED_INT8;
        break;
    case CV_32S:
        format.image_channel_data_type = CL_UNSIGNED_INT32;
        break;
    case CV_32F:
        format.image_channel_data_type = CL_FLOAT;
        break;
    default:
        // FIX: message typo — was "Image forma is not supported".
        CV_Error(-1, "Image format is not supported");
        break;
    }
    // Map channel count to a CL channel order.
    switch(channels)
    {
    case 1:
        format.image_channel_order = CL_R;
        break;
    case 3:
        format.image_channel_order = CL_RGB;
        break;
    case 4:
        format.image_channel_order = CL_RGBA;
        break;
    default:
        CV_Error(-1, "Image format is not supported");
        break;
    }
#ifdef CL_VERSION_1_2
    // this enables backwards portability to
    // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
    if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2))
    {
        cl_image_desc desc;
        desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
        desc.image_width      = mat.cols;
        desc.image_height     = mat.rows;
        desc.image_depth      = 0;
        desc.image_array_size = 1;
        desc.image_row_pitch  = 0;
        desc.image_slice_pitch = 0;
        desc.buffer           = NULL;
        desc.num_mip_levels   = 0;
        desc.num_samples      = 0;
        texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(),
                                CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
    }
    else
#endif
    {
        texture = clCreateImage2D(
            *(cl_context*)mat.clCxt->getOpenCLContextPtr(),
            CL_MEM_READ_WRITE,
            &format,
            mat.cols,
            mat.rows,
            0,
            NULL,
            &err);
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { mat.cols, mat.rows, 1 };

    cl_mem devData;
    if (mat.cols * mat.elemSize() != mat.step)
    {
        // Padded rows: compact into a temporary buffer before the image copy.
        devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(),
                                 CL_MEM_READ_ONLY,
                                 mat.cols * mat.rows * mat.elemSize(), NULL, NULL);
        const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
        clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(),
                                (cl_mem)mat.data, devData, origin, origin, regin,
                                mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
        clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
    }
    else
    {
        devData = (cl_mem)mat.data;
    }

    clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(),
                               devData, texture, 0, origin, region, 0, NULL, 0);
    if ((mat.cols * mat.elemSize() != mat.step))
    {
        clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
        clReleaseMemObject(devData);
    }

    openCLSafeCall(err);
    return texture;
}
END_TEST

// Verifies buffer<->image copies with non-zero origins/offsets, then checks
// that mapping a CL_MEM_USE_HOST_PTR image returns the host pointer.
START_TEST (test_copy_image_buffer)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_mem image, buffer;
    cl_int result;
    cl_event event;

    unsigned char image_buffer[3*3*4] = {
        255, 0, 0, 0,    0, 255, 0, 0,    0, 0, 255, 0,
        128, 0, 0, 0,    0, 128, 0, 0,    0, 0, 128, 0,
        64, 0, 0, 0,     0, 64, 0, 0,     0, 0, 64, 0
    };

    // Square that will be put in image_buffer at (1, 0)
    unsigned char buffer_buffer[2*2*4+1] = {
        33,                     // Oh, a padding !
        255, 255, 255, 0,       255, 0, 255, 0,
        0, 255, 255, 0,         255, 255, 0, 0
    };

    // What we must get once re-reading 2x2 rect at (1, 1)
    unsigned char correct_data[2*2*4] = {
        0, 255, 255, 0,         255, 255, 0, 0,
        0, 64, 0, 0,            0, 0, 64, 0
    };

    cl_image_format fmt;
    fmt.image_channel_data_type = CL_UNORM_INT8;
    fmt.image_channel_order = CL_RGBA;

    size_t origin[3] = {1, 0, 0};
    size_t region[3] = {2, 2, 1};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if( result != CL_SUCCESS, "unable to get the default device" );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" );

    image = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
                            3, 3, 0, image_buffer, &result);
    fail_if( result != CL_SUCCESS, "unable to create a 3x3 bgra image" );

    buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                            sizeof(buffer_buffer), buffer_buffer, &result);
    fail_if( result != CL_SUCCESS, "unable to create a buffer object" );

    // Write buffer in image
    result = clEnqueueCopyBufferToImage(queue, buffer, image, 1, origin, region,
                                        0, 0, &event);
    fail_if( result != CL_SUCCESS, "unable to queue a copy buffer to image event, buffer offset 1, image 2x2 @ (1, 0)" );

    result = clWaitForEvents(1, &event);
    fail_if( result != CL_SUCCESS, "cannot wait for event" );
    clReleaseEvent(event);

    // Read it back into buffer, again with an offset
    origin[1] = 1;
    result = clEnqueueCopyImageToBuffer(queue, image, buffer, origin, region, 1,
                                        0, 0, &event);
    fail_if( result != CL_SUCCESS, "unable to queue a copy image to buffer event, buffer offset 1, image 2x2 @ (1, 1)" );

    result = clWaitForEvents(1, &event);
    fail_if( result != CL_SUCCESS, "cannot wait for event" );

    fail_if( std::memcmp(buffer_buffer + 1, correct_data, sizeof(correct_data)) != 0,
             "copying data around isn't working the expected way" );

    // Map the image and check pointers
    unsigned char *mapped;
    size_t row_pitch;

    origin[0] = 0;
    origin[1] = 0;
    origin[2] = 0;

    mapped = (unsigned char *)clEnqueueMapImage(queue, image, 1, CL_MAP_READ,
                origin, region, &row_pitch, 0, 0, 0, 0, &result);
    fail_if( result != CL_SUCCESS, "unable to map an image" );
    // FIX: message typo — was "mapped aread".
    fail_if( mapped != image_buffer, "mapped area doesn't match host ptr" );

    // FIX: the mapped region was never unmapped before releasing the image.
    result = clEnqueueUnmapMemObject(queue, image, mapped, 0, 0, 0);
    fail_if( result != CL_SUCCESS, "unable to unmap the image" );

    clReleaseEvent(event);
    clReleaseMemObject(image);
    clReleaseMemObject(buffer);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
void OpenCLExecuter::ocl_filter_shared(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer cl_sampler sampler; // OpenCL sampler cl_kernel ckKernel; // OpenCL kernel int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process // set Local work size dimensions // size_t local_threads[3] ={256,256,64}; // set Global work size dimensions // size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]}; // set Global work size dimensions size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err); printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // create a sampler object sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err); printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: 
clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // grab input data from OpenGL, compute, copy the results back to OpenGL // Runs asynchronous to host, up until blocking clFinish at the end glFinish(); glFlush(); // grab the OpenGL texture object for read/writing from OpenCL err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Execute a kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); /* // Blocking read of results from GPU to Host int size = volobj->texwidth*volobj->texheight*volobj->texdepth; unsigned char* result = new unsigned char[size]; err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i]; delete[] result; */ // copy OpenCL buffer to OpenGl texture size_t corigin[3] = {0,0,0}; size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth}; err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL); printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err)); //make sure we block until we 
are done. //err = clFinish(ocl_wrapper->commandQue); //printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err)); //release opengl objects now err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(src_buffer)clReleaseMemObject(src_buffer); if(dst_buffer)clReleaseMemObject(dst_buffer); }