static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, const Mat & kd, const Mat & ks, double scale, double delta, int borderType, int depth, int ddepth) { const size_t tileSizeX = 16; const size_t tileSizeYmin = 8; const ocl::Device dev = ocl::Device::getDefault(); int stype = _src.type(); int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), esz = CV_ELEM_SIZE(stype); bool doubleSupport = dev.doubleFPConfig() > 0; if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; Mat kernelX = kd.reshape(1, 1); if (kernelX.cols % 2 != 1) return false; Mat kernelY = ks.reshape(1, 1); if (kernelY.cols % 2 != 1) return false; CV_Assert(kernelX.cols == kernelY.cols); size_t wgs = dev.maxWorkGroupSize(); size_t lmsz = dev.localMemSize(); size_t src_step = _src.step(), src_offset = _src.offset(); const size_t tileSizeYmax = wgs / tileSizeX; // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn; if (((src_offset % src_step) % esz == 0) && ( (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || ((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) && (_src.cols() >= (int) (kernelX.cols + tileSizeX) && _src.rows() >= (int) (kernelY.cols + tileSizeYmax))) ) && (tileSizeX * tileSizeYmin <= wgs) && (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz) ) { Size size = _src.size(), wholeSize; Point origin; int dtype = CV_MAKE_TYPE(ddepth, cn); int wdepth = CV_32F; size_t tileSizeY = tileSizeYmax; while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz)) { tileSizeY /= 2; } size_t lt2[2] = { tileSizeX, tileSizeY}; size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] }; char cvt[2][40]; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUS=%d%s%s" " -D convertToWT=%s -D convertToDT=%s" " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s" " -D srcT=%s -D dstT=%s -D WT=%s" " -D CN=%d ", (int)lt2[0], (int)lt2[1], kernelX.cols / 2, ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), ocl::typeToStr(CV_MAKETYPE(sdepth, cn)), ocl::typeToStr(CV_MAKETYPE(ddepth, cn)), ocl::typeToStr(CV_MAKETYPE(wdepth, cn)), cn); ocl::Kernel k("laplacian", ocl::imgproc::laplacian5_oclsrc, opts); if (k.empty()) return false; UMat src = _src.getUMat(); _dst.create(size, dtype); UMat dst = _dst.getUMat(); int src_offset_x = static_cast<int>((src_offset % src_step) / esz); int src_offset_y = static_cast<int>(src_offset / src_step); src.locateROI(wholeSize, origin); k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), static_cast<float>(scale), static_cast<float>(delta)); return k.run(2, gt2, lt2, false); } int iscale = cvRound(scale), idelta = cvRound(delta); bool floatCoeff = std::fabs(delta - idelta) > DBL_EPSILON || std::fabs(scale - iscale) > DBL_EPSILON; int wdepth = std::max(depth, floatCoeff ? CV_32F : CV_32S), kercn = 1; if (!doubleSupport && wdepth == CV_64F) return false; char cvt[2][40]; ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc, format("-D ONLY_SUM_CONVERT " "-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d " "-D convertToWT=%s -D convertToDT=%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; UMat d2x, d2y; sepFilter2D(_src, d2x, depth, kd, ks, Point(-1, -1), 0, borderType); sepFilter2D(_src, d2y, depth, ks, kd, Point(-1, -1), 0, borderType); UMat dst = _dst.getUMat(); ocl::KernelArg d2xarg = ocl::KernelArg::ReadOnlyNoSize(d2x), d2yarg = ocl::KernelArg::ReadOnlyNoSize(d2y), dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); if (wdepth >= CV_32F) k.args(d2xarg, d2yarg, dstarg, (float)scale, (float)delta); else k.args(d2xarg, d2yarg, dstarg, iscale, idelta); size_t globalsize[] = { dst.cols * cn / kercn, dst.rows }; return k.run(2, globalsize, NULL, false); }