/**
 * OpenCL path for computing the integral image (sum only) of an 8-bit
 * single-channel source.
 *
 * Two kernels from integral_sum_oclsrc are enqueued back to back:
 * "integral_sum_cols" fills an intermediate buffer, then "integral_sum_rows"
 * consumes it and writes the final sum.
 *
 * @param _src   Source image. Must be CV_8UC1 and its step and offset must be
 *               multiples of vlen, otherwise the function declines.
 * @param _sum   Destination. Created here as (src.cols + 1) x (src.rows + 1)
 *               with depth sdepth.
 * @param sdepth Depth of the sum; only CV_32S and CV_32F are accepted.
 * @return true if both kernels were built and enqueued successfully; false if
 *         the input is not supported or a kernel could not be built or run.
 */
static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
{
    if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
         !(sdepth == CV_32S || sdepth == CV_32F) )
        return false;

    // Both kernels are compiled with the same build options.
    const String opts = format("-D sdepth=%d", sdepth);

    ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, opts);
    if (k1.empty())
        return false;

    // The intermediate buffer is laid out transposed relative to the source:
    // its width is the source height rounded up to a multiple of vlen and its
    // height is the source width.
    Size size = _src.size(),
         t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width),
         ssize(size.width + 1, size.height + 1);
    _sum.create(ssize, sdepth);

    UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
    // Keep only the columns that correspond to real source rows; the padding
    // stays allocated behind the view.
    t_sum = t_sum(Range::all(), Range(0, size.height));

    int offset = (int)src.offset / vlen, pre_invalid = (int)src.offset % vlen;
    int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
    // NOTE(review): the destination offset is divided by vlen rather than by
    // the element size; this presumably relies on vlen matching the 4-byte
    // element size of CV_32S / CV_32F - confirm against the definition of vlen.
    int sum_offset = (int)sum.offset / vlen;

    k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
            offset, pre_invalid, src.rows, src.cols, (int)src.step, (int)t_sum.step);

    // One 256-item work-group for every two vlen-wide column groups.
    size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
    if (!k1.run(1, &gt, &lt, false))
        return false;

    ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, opts);
    if (k2.empty())
        return false;

    k2.args(ocl::KernelArg::PtrReadWrite(t_sum), ocl::KernelArg::PtrWriteOnly(sum),
            t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset);

    size_t gt2 = t_sum.cols * 32, lt2 = 256;
    return k2.run(1, &gt2, &lt2, false);
}
static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth ) { bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) ) return false; char cvt[40]; String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s", sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth), ocl::convertTypeStr(sdepth, sqdepth, 4, cvt), doubleSupport ? " -D DOUBLE_SUPPORT" : ""); ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts); if (k1.empty()) return false; Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width); UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth); t_sum = t_sum(Range::all(), Range(0, size.height)); t_sqsum = t_sqsum(Range::all(), Range(0, size.height)); _sum.create(dsize, sdepth); _sqsum.create(dsize, sqdepth); UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat(); int offset = src.offset / vlen; int pre_invalid = src.offset % vlen; int vcols = (pre_invalid + src.cols + vlen - 1) / vlen; int sum_offset = sum.offset / sum.elemSize(); int sqsum_offset = sqsum.offset / sqsum.elemSize(); CV_Assert(sqsum.offset % sqsum.elemSize() == 0); k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum), ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows, src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step); size_t gt = ((vcols + 1) / 2) * 256, lt = 256; if (!k1.run(1, >, <, false)) return false; ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts); if (k2.empty()) return false; k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum), ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum), t_sum.rows, 
t_sum.cols, (int)t_sum.step, (int)t_sqsum.step, (int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset); size_t gt2 = t_sum.cols * 32, lt2 = 256; return k2.run(1, >2, <2, false); }
static bool ocl_sepFilter3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, double delta, int borderType) { const ocl::Device & dev = ocl::Device::getDefault(); int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if ( !(dev.isIntel() && (type == CV_8UC1) && (ddepth == CV_8U) && (_src.offset() == 0) && (_src.step() % 4 == 0) && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) ) return false; Mat kernelX = _kernelX.getMat().reshape(1, 1); if (kernelX.cols % 2 != 1) return false; Mat kernelY = _kernelY.getMat().reshape(1, 1); if (kernelY.cols % 2 != 1) return false; if (ddepth < 0) ddepth = sdepth; Size size = _src.size(); size_t globalsize[2] = { 0, 0 }; size_t localsize[2] = { 0, 0 }; globalsize[0] = size.width / 16; globalsize[1] = size.height / 2; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; char build_opts[1024]; sprintf(build_opts, "-D %s %s%s", borderMap[borderType], ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); ocl::Kernel kernel("sepFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::sepFilter3x3_oclsrc, build_opts); if (kernel.empty()) return false; UMat src = _src.getUMat(); _dst.create(size, CV_MAKETYPE(ddepth, cn)); if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) return false; UMat dst = _dst.getUMat(); int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); idxArg = kernel.set(idxArg, (int)src.step); idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); idxArg = kernel.set(idxArg, (int)dst.step); idxArg = kernel.set(idxArg, (int)dst.rows); idxArg = kernel.set(idxArg, (int)dst.cols); idxArg = kernel.set(idxArg, static_cast<float>(delta)); return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); }
static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, const Mat & kd, const Mat & ks, double scale, double delta, int borderType, int depth, int ddepth) { const size_t tileSizeX = 16; const size_t tileSizeYmin = 8; const ocl::Device dev = ocl::Device::getDefault(); int stype = _src.type(); int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), esz = CV_ELEM_SIZE(stype); bool doubleSupport = dev.doubleFPConfig() > 0; if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; Mat kernelX = kd.reshape(1, 1); if (kernelX.cols % 2 != 1) return false; Mat kernelY = ks.reshape(1, 1); if (kernelY.cols % 2 != 1) return false; CV_Assert(kernelX.cols == kernelY.cols); size_t wgs = dev.maxWorkGroupSize(); size_t lmsz = dev.localMemSize(); size_t src_step = _src.step(), src_offset = _src.offset(); const size_t tileSizeYmax = wgs / tileSizeX; // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 
4 : cn; if (((src_offset % src_step) % esz == 0) && ( (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || ((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) && (_src.cols() >= (int) (kernelX.cols + tileSizeX) && _src.rows() >= (int) (kernelY.cols + tileSizeYmax))) ) && (tileSizeX * tileSizeYmin <= wgs) && (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz) ) { Size size = _src.size(), wholeSize; Point origin; int dtype = CV_MAKE_TYPE(ddepth, cn); int wdepth = CV_32F; size_t tileSizeY = tileSizeYmax; while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz)) { tileSizeY /= 2; } size_t lt2[2] = { tileSizeX, tileSizeY}; size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] }; char cvt[2][40]; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUS=%d%s%s" " -D convertToWT=%s -D convertToDT=%s" " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s" " -D srcT=%s -D dstT=%s -D WT=%s" " -D CN=%d ", (int)lt2[0], (int)lt2[1], kernelX.cols / 2, ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), ocl::typeToStr(CV_MAKETYPE(sdepth, cn)), ocl::typeToStr(CV_MAKETYPE(ddepth, cn)), ocl::typeToStr(CV_MAKETYPE(wdepth, cn)), cn); ocl::Kernel k("laplacian", ocl::imgproc::laplacian5_oclsrc, opts); if (k.empty()) return false; UMat src = _src.getUMat(); _dst.create(size, dtype); UMat dst = _dst.getUMat(); int src_offset_x = static_cast<int>((src_offset % src_step) / esz); int src_offset_y = static_cast<int>(src_offset / src_step); 
src.locateROI(wholeSize, origin); k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), static_cast<float>(scale), static_cast<float>(delta)); return k.run(2, gt2, lt2, false); } int iscale = cvRound(scale), idelta = cvRound(delta); bool floatCoeff = std::fabs(delta - idelta) > DBL_EPSILON || std::fabs(scale - iscale) > DBL_EPSILON; int wdepth = std::max(depth, floatCoeff ? CV_32F : CV_32S), kercn = 1; if (!doubleSupport && wdepth == CV_64F) return false; char cvt[2][40]; ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc, format("-D ONLY_SUM_CONVERT " "-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d " "-D convertToWT=%s -D convertToDT=%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; UMat d2x, d2y; sepFilter2D(_src, d2x, depth, kd, ks, Point(-1, -1), 0, borderType); sepFilter2D(_src, d2y, depth, ks, kd, Point(-1, -1), 0, borderType); UMat dst = _dst.getUMat(); ocl::KernelArg d2xarg = ocl::KernelArg::ReadOnlyNoSize(d2x), d2yarg = ocl::KernelArg::ReadOnlyNoSize(d2y), dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); if (wdepth >= CV_32F) k.args(d2xarg, d2yarg, dstarg, (float)scale, (float)delta); else k.args(d2xarg, d2yarg, dstarg, iscale, idelta); size_t globalsize[] = { dst.cols * cn / kercn, dst.rows }; return k.run(2, globalsize, NULL, false); }