/**
 * Builds and launches the "calcSum" OpenCL kernel on @p _src.
 *
 * @param _src   Input array; its type, depth and channel count are baked into
 *               the kernel build options.
 * @param result Re-created here as a 1x1 CV_32FC1 buffer and handed to the
 *               kernel as its write-only output pointer.
 * @return false when the kernel could not be created, otherwise whatever
 *         ocl::Kernel::run reports.
 */
static bool sumTemplate(InputArray _src, UMat & result)
{
    const int type = _src.type();
    const int depth = CV_MAT_DEPTH(type);
    const int cn = CV_MAT_CN(type);

    // The kernel works in 32-bit float with the same channel count as the source.
    const int wdepth = CV_32F;
    const int wtype = CV_MAKE_TYPE(wdepth, cn);

    // Not const: its address is passed to run() as the local size below.
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();

    // WGS2_ALIGNED = half of the smallest power of two that is >= wgs.
    int wgs2_aligned = 1;
    for (; wgs2_aligned < (int)wgs; wgs2_aligned <<= 1)
    {
    }
    wgs2_aligned >>= 1;

    char cvt[40];
    ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc,
                  format("-D CALC_SUM -D T=%s -D T1=%s -D WT=%s -D cn=%d -D convertToWT=%s -D WGS=%d -D WGS2_ALIGNED=%d",
                         ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), cn,
                         ocl::convertTypeStr(depth, wdepth, cn, cvt),
                         (int)wgs, wgs2_aligned));
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    result.create(1, 1, CV_32FC1);

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src);
    ocl::KernelArg resarg = ocl::KernelArg::PtrWriteOnly(result);
    k.args(srcarg, src.cols, (int)src.total(), resarg);

    // One-dimensional launch whose global size equals the local size,
    // i.e. a single work-group of wgs items.
    size_t globalsize = wgs;
    return k.run(1, &globalsize, &wgs, false);
}
static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, int & almostTemplateWindowSizeSqBinShift) { const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255; int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue; int depth = DataType<FT>::depth; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (depth == CV_64F && !doubleSupport) return false; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) int templateWindowSizeSq = templateWindowSize * templateWindowSize; almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; const FT WEIGHT_THRESHOLD = 1e-3f; int maxDist = 255 * 255 * cn; int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); FT den = 1.0f / (h * h * cn); almostDist2Weight.create(1, almostMaxDist, CV_32SC1); ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist, almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD); size_t globalsize[1] = { almostMaxDist }; return k.run(1, globalsize, NULL, false); }
/**
 * Sizes the block/DFT geometry and allocates the work buffers for
 * convolving an image of @p image_size with a template of @p templ_size.
 *
 * Sets result_size, block_size and dft_size, then (re)creates the spatial
 * buffers (CV_32F, dft_size) and the spectrum buffers (CV_32FC2,
 * dft_size.height x (dft_size.width / 2 + 1)).
 *
 * Raises CV_StsOutOfRange via CV_Error when getOptimalDFTSize yields a
 * non-positive size.
 */
void ConvolveBuf::create(Size image_size, Size templ_size)
{
    const double blockScale = 4.5;
    const int minBlockSize = 256;

    result_size = Size(image_size.width - templ_size.width + 1,
                       image_size.height - templ_size.height + 1);

    // Initial block estimate: scaled size, raised to at least
    // (minBlockSize - templ + 1), then capped at the result size.
    // NOTE(review): the width estimate scales result_size.width while the
    // height estimate scales templ_size.height - confirm this asymmetry is
    // intended.
    const int estimatedWidth = std::max(cvRound(result_size.width * blockScale),
                                        minBlockSize - templ_size.width + 1);
    const int estimatedHeight = std::max(cvRound(templ_size.height * blockScale),
                                         minBlockSize - templ_size.height + 1);
    block_size.width = std::min(estimatedWidth, result_size.width);
    block_size.height = std::min(estimatedHeight, result_size.height);

    // DFT size covering block + template - 1; the width is forced to be >= 2.
    dft_size.width = std::max(getOptimalDFTSize(block_size.width + templ_size.width - 1), 2);
    dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
    if( dft_size.width <= 0 || dft_size.height <= 0 )
        CV_Error( CV_StsOutOfRange, "the input arrays are too big" );

    // Use the maximum result block that fits the chosen DFT size, still
    // capped at the result size. Nothing below modifies dft_size, templ_size
    // or result_size, so a single computation here is sufficient.
    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);

    // Spatial-domain buffers.
    image_block.create(dft_size, CV_32F);
    templ_block.create(dft_size, CV_32F);
    result_data.create(dft_size, CV_32F);

    // Spectrum buffers: two channels, width / 2 + 1 columns.
    const int spectRows = dft_size.height;
    const int spectCols = dft_size.width / 2 + 1;
    image_spect.create(spectRows, spectCols, CV_32FC2);
    templ_spect.create(spectRows, spectCols, CV_32FC2);
    result_spect.create(spectRows, spectCols, CV_32FC2);
}
bool BTVL1_Base::ocl_process(InputArrayOfArrays _src, OutputArray _dst, InputArrayOfArrays _forwardMotions, InputArrayOfArrays _backwardMotions, int baseIdx) { std::vector<UMat> & src = *(std::vector<UMat> *)_src.getObj(), & forwardMotions = *(std::vector<UMat> *)_forwardMotions.getObj(), & backwardMotions = *(std::vector<UMat> *)_backwardMotions.getObj(); // update blur filter and btv weights if (!filter_ || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_) { filter_ = createGaussianFilter(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_); curBlurKernelSize_ = blurKernelSize_; curBlurSigma_ = blurSigma_; curSrcType_ = src[0].type(); } if (btvWeights_.empty() || btvKernelSize_ != curBtvKernelSize_ || alpha_ != curAlpha_) { calcBtvWeights(btvKernelSize_, alpha_, btvWeights_); Mat(btvWeights_, true).copyTo(ubtvWeights_); curBtvKernelSize_ = btvKernelSize_; curAlpha_ = alpha_; } // calc high res motions calcRelativeMotions(forwardMotions, backwardMotions, ulowResForwardMotions_, ulowResBackwardMotions_, baseIdx, src[0].size()); upscaleMotions(ulowResForwardMotions_, uhighResForwardMotions_, scale_); upscaleMotions(ulowResBackwardMotions_, uhighResBackwardMotions_, scale_); uforwardMaps_.resize(uhighResForwardMotions_.size()); ubackwardMaps_.resize(uhighResForwardMotions_.size()); for (size_t i = 0; i < uhighResForwardMotions_.size(); ++i) buildMotionMaps(uhighResForwardMotions_[i], uhighResBackwardMotions_[i], uforwardMaps_[i], ubackwardMaps_[i]); // initial estimation const Size lowResSize = src[0].size(); const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_); resize(src[baseIdx], uhighRes_, highResSize, 0, 0, INTER_LINEAR); // TODO // iterations udiffTerm_.create(highResSize, uhighRes_.type()); ua_.create(highResSize, uhighRes_.type()); ub_.create(highResSize, uhighRes_.type()); uc_.create(lowResSize, uhighRes_.type()); for (int i = 0; i < iterations_; ++i) { 
udiffTerm_.setTo(Scalar::all(0)); for (size_t k = 0; k < src.size(); ++k) { // a = M * Ih remap(uhighRes_, ua_, ubackwardMaps_[k], noArray(), INTER_NEAREST); // b = HM * Ih GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_); // c = DHM * Ih resize(ub_, uc_, lowResSize, 0, 0, INTER_NEAREST); diffSign(src[k], uc_, uc_); // a = Dt * diff upscale(uc_, ua_, scale_); // b = HtDt * diff GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_); // a = MtHtDt * diff remap(ub_, ua_, uforwardMaps_[k], noArray(), INTER_NEAREST); add(udiffTerm_, ua_, udiffTerm_); } if (lambda_ > 0) { calcBtvRegularization(uhighRes_, uregTerm_, btvKernelSize_, btvWeights_, ubtvWeights_); addWeighted(udiffTerm_, 1.0, uregTerm_, -lambda_, 0.0, udiffTerm_); } addWeighted(uhighRes_, 1.0, udiffTerm_, tau_, 0.0, uhighRes_); } Rect inner(btvKernelSize_, btvKernelSize_, uhighRes_.cols - 2 * btvKernelSize_, uhighRes_.rows - 2 * btvKernelSize_); uhighRes_(inner).copyTo(_dst); return true; }
bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors) { int dsize = params->descriptorSize(); int nFeatures = keypoints.cols; if (nFeatures == 0) { _descriptors.release(); return true; } _descriptors.create(nFeatures, dsize, CV_32F); UMat descriptors; if( _descriptors.isUMat() ) descriptors = _descriptors.getUMat(); else descriptors.create(nFeatures, dsize, CV_32F); ocl::Kernel kerCalcDesc, kerNormDesc; if( dsize == 64 ) { kerCalcDesc.create("SURF_computeDescriptors64", ocl::xfeatures2d::surf_oclsrc, kerOpts); kerNormDesc.create("SURF_normalizeDescriptors64", ocl::xfeatures2d::surf_oclsrc, kerOpts); } else { CV_Assert(dsize == 128); kerCalcDesc.create("SURF_computeDescriptors128", ocl::xfeatures2d::surf_oclsrc, kerOpts); kerNormDesc.create("SURF_normalizeDescriptors128", ocl::xfeatures2d::surf_oclsrc, kerOpts); } size_t localThreads[] = {6, 6}; size_t globalThreads[] = {nFeatures*localThreads[0], localThreads[1]}; if(haveImageSupport) { kerCalcDesc.args(imgTex, img_rows, img_cols, ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::WriteOnlyNoSize(descriptors)); } else { kerCalcDesc.args(ocl::KernelArg::ReadOnlyNoSize(img), img_rows, img_cols, ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::WriteOnlyNoSize(descriptors)); } if(!kerCalcDesc.run(2, globalThreads, localThreads, true)) return false; size_t localThreads_n[] = {dsize, 1}; size_t globalThreads_n[] = {nFeatures*localThreads_n[0], localThreads_n[1]}; globalThreads[0] = nFeatures * localThreads[0]; globalThreads[1] = localThreads[1]; bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)). run(2, globalThreads_n, localThreads_n, true); if(ok && !_descriptors.isUMat()) descriptors.copyTo(_descriptors); return ok; }
static bool ocl_Canny(InputArray _src, const UMat& dx_, const UMat& dy_, OutputArray _dst, float low_thresh, float high_thresh, int aperture_size, bool L2gradient, int cn, const Size & size) { CV_INSTRUMENT_REGION_OPENCL() UMat map; const ocl::Device &dev = ocl::Device::getDefault(); int max_wg_size = (int)dev.maxWorkGroupSize(); int lSizeX = 32; int lSizeY = max_wg_size / 32; if (lSizeY == 0) { lSizeX = 16; lSizeY = max_wg_size / 16; } if (lSizeY == 0) { lSizeY = 1; } if (aperture_size == 7) { low_thresh = low_thresh / 16.0f; high_thresh = high_thresh / 16.0f; } if (L2gradient) { low_thresh = std::min(32767.0f, low_thresh); high_thresh = std::min(32767.0f, high_thresh); if (low_thresh > 0) low_thresh *= low_thresh; if (high_thresh > 0) high_thresh *= high_thresh; } int low = cvFloor(low_thresh), high = cvFloor(high_thresh); if (!useCustomDeriv && aperture_size == 3 && !_src.isSubmatrix()) { /* stage1_with_sobel: Sobel operator Calc magnitudes Non maxima suppression Double thresholding */ char cvt[40]; ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc, format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_floatN=%s -D floatN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", cn, ocl::memopTypeToStr(_src.depth()), ocl::convertTypeStr(_src.depth(), CV_32F, cn, cvt), ocl::typeToStr(CV_MAKE_TYPE(CV_32F, cn)), lSizeX, lSizeY, L2gradient ? 
" -D L2GRAD" : "")); if (with_sobel.empty()) return false; UMat src = _src.getUMat(); map.create(size, CV_32S); with_sobel.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(map), (float) low, (float) high); size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }, localsize[2] = { (size_t)lSizeX, (size_t)lSizeY }; if (!with_sobel.run(2, globalsize, localsize, false)) return false; } else { /* stage1_without_sobel: Calc magnitudes Non maxima suppression Double thresholding */ double scale = 1.0; if (aperture_size == 7) { scale = 1 / 16.0; } UMat dx, dy; if (!useCustomDeriv) { Sobel(_src, dx, CV_16S, 1, 0, aperture_size, scale, 0, BORDER_REPLICATE); Sobel(_src, dy, CV_16S, 0, 1, aperture_size, scale, 0, BORDER_REPLICATE); } else { dx = dx_; dy = dy_; } ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc, format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : "")); if (without_sobel.empty()) return false; map.create(size, CV_32S); without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy), ocl::KernelArg::WriteOnly(map), low, high); size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }, localsize[2] = { (size_t)lSizeX, (size_t)lSizeY }; if (!without_sobel.run(2, globalsize, localsize, false)) return false; } int PIX_PER_WI = 8; /* stage2: hysteresis (add weak edges if they are connected with strong edges) */ int sizey = lSizeY / PIX_PER_WI; if (sizey == 0) sizey = 1; size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { (size_t)lSizeX, (size_t)sizey }; ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc, format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d", PIX_PER_WI, lSizeX, sizey)); if (edgesHysteresis.empty()) return false; edgesHysteresis.args(ocl::KernelArg::ReadWrite(map)); if (!edgesHysteresis.run(2, 
globalsize, localsize, false)) return false; // get edges ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI)); if (getEdgesKernel.empty()) return false; _dst.create(size, CV_8UC1); UMat dst = _dst.getUMat(); getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst)); return getEdgesKernel.run(2, globalsize, NULL, false); }