void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat& image, const GpuMat& mask)
{
    CV_Assert(image.type() == CV_8UC1);
    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));

    imagePyr_.resize(nLevels_);
    maskPyr_.resize(nLevels_);

    for (int level = 0; level < nLevels_; ++level)
    {
        float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);

        Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));

        ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
        ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
        maskPyr_[level].setTo(Scalar::all(255));

        // Compute the resized image
        if (level != firstLevel_)
        {
            if (level < firstLevel_)
            {
                resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);

                if (!mask.empty())
                    resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
            }
            else
            {
                resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);

                if (!mask.empty())
                {
                    resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);

                    // Zero out mask pixels that interpolation blurred below 255
                    threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
                }
            }
        }
        else
        {
            image.copyTo(imagePyr_[level]);

            if (!mask.empty())
                mask.copyTo(maskPyr_[level]);
        }

        // Filter keypoints by image border
        ensureSizeIsEnough(sz, CV_8UC1, buf_);
        buf_.setTo(Scalar::all(0));
        Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
        buf_(inner).setTo(Scalar::all(255));

        bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
    }
}
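// buildScalePyramids() relies on a getScale() helper that is not shown in this
// section. A minimal sketch of a plausible definition (requires <cmath>),
// chosen so that level == firstLevel_ yields a scale of exactly 1:
static inline float getScale(float scaleFactor, int firstLevel, int level)
{
    return std::pow(scaleFactor, static_cast<float>(level - firstLevel));
}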
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type,
                        int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
    double scale = 1, shift = 0;

    if (norm_type == NORM_MINMAX)
    {
        double smin = 0, smax = 0;
        double dmin = std::min(a, b), dmax = std::max(a, b);

        minMax(src, &smin, &smax, mask, norm_buf);

        scale = (dmax - dmin) * (smax - smin > numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
        shift = dmin - smin * scale;
    }
    else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
    {
        scale = norm(src, norm_type, mask, norm_buf);
        scale = scale > numeric_limits<double>::epsilon() ? a / scale : 0.0;
        shift = 0;
    }
    else
    {
        CV_Error(CV_StsBadArg, "Unknown/unsupported norm type");
    }

    if (mask.empty())
    {
        src.convertTo(dst, dtype, scale, shift);
    }
    else
    {
        src.convertTo(cvt_buf, dtype, scale, shift);
        cvt_buf.copyTo(dst, mask);
    }
}
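// A hedged usage sketch for the normalize() above: stretching a single-channel
// float image to [0, 255] and converting to 8-bit in one call. The reusable
// norm_buf/cvt_buf buffers avoid reallocations across repeated calls; the
// wrapper function and variable names here are illustrative, not part of the API.
void normalizeExample(const cv::gpu::GpuMat& srcf)   // srcf: CV_32FC1
{
    cv::gpu::GpuMat dst, norm_buf, cvt_buf;
    cv::gpu::normalize(srcf, dst, 0, 255, cv::NORM_MINMAX, CV_8U,
                       cv::gpu::GpuMat(), norm_buf, cvt_buf);
}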
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
{
    CV_Assert(dst);

    bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE);
    CV_Assert(src.depth() != CV_64F || double_ok);

    int depth = src.depth();
    int num_channels = src.channels();

    if (num_channels == 1)
    {
        src.copyTo(dst[0]);
        return;
    }

    // Check the channel count before writing into the caller's dst array
    CV_Assert(num_channels <= 4);

    for (int i = 0; i < num_channels; ++i)
        dst[i].create(src.size(), depth);

    DevMem2D dst_as_devmem[4];
    for (int i = 0; i < num_channels; ++i)
        dst_as_devmem[i] = dst[i];

    DevMem2D src_as_devmem(src);
    split_merge::split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
}
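// A hedged usage sketch for the internal split() helper above: splitting a
// BGR GpuMat into per-channel planes on the default stream. The public
// cv::gpu::split overloads forward to this helper in a similar way.
void splitExample(const cv::gpu::GpuMat& bgr)   // bgr: CV_8UC3
{
    cv::gpu::GpuMat planes[3];
    cudaStream_t stream = 0;        // default stream
    split(bgr, planes, stream);     // each planes[i] becomes CV_8UC1
}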
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
    CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
              (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));

    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
                && fabs(beta) < std::numeric_limits<double>::epsilon();

    if (rtype < 0)
        rtype = src.type();
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels());

    int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
    if (sdepth == ddepth && noScale)
    {
        src.copyTo(dst);
        return;
    }

    // If src aliases dst, convert from a temporary copy instead
    GpuMat temp;
    const GpuMat* psrc = &src;
    if (sdepth != ddepth && psrc == &dst)
        psrc = &(temp = src);

    dst.create(src.size(), rtype);
    convertTo(*psrc, dst, alpha, beta, Impl::getStream(impl));
}
void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
{
    using namespace cv::gpu::device::pyramid;

    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
        {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
        {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
        {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
        {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
        {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
        {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
    };

    CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);

    ensureSizeIsEnough(outRoi, layer0_.type(), outImg);

    const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
    CV_Assert(func != 0);

    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
    {
        if (stream)
            stream.enqueueCopy(layer0_, outImg);
        else
            layer0_.copyTo(outImg);

        return; // exact match with the base layer, nothing to interpolate
    }

    float lastScale = 1.0f;
    float curScale;
    GpuMat lastLayer = layer0_;
    GpuMat curLayer;

    for (int i = 0; i < nLayers_ - 1; ++i)
    {
        curScale = lastScale * 0.5f;
        curLayer = pyramid_[i];

        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
        {
            if (stream)
                stream.enqueueCopy(curLayer, outImg);
            else
                curLayer.copyTo(outImg);

            return; // exact match with a pyramid layer
        }

        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
            break;

        lastScale = curScale;
        lastLayer = curLayer;
    }

    // Interpolate from the nearest enclosing layer
    func(lastLayer, outImg, StreamAccessor::getStream(stream));
}
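// A hedged usage sketch: building a pyramid once, then pulling out an
// arbitrary intermediate resolution via getLayer(). The exact signature of
// build() is assumed here (image plus layer count, defaulted stream).
void pyramidExample(const cv::gpu::GpuMat& img)
{
    cv::gpu::ImagePyramid pyr;
    pyr.build(img, 4);  // assumed: 4 layers, each half the previous size

    cv::gpu::GpuMat smaller;
    pyr.getLayer(smaller, cv::Size(img.cols / 3, img.rows / 3));
}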
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2)
{
    if (ksize.width == 1 && ksize.height == 1)
    {
        src.copyTo(dst);
        return;
    }

    dst.create(src.size(), src.type());

    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2);
    f->apply(src, dst);
}
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2,
                           int rowBorderType, int columnBorderType, Stream& stream)
{
    if (ksize.width == 1 && ksize.height == 1)
    {
        src.copyTo(dst);
        return;
    }

    dst.create(src.size(), src.type());

    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
}
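// A hedged usage sketch for the two GaussianBlur overloads above: a blocking
// blur on the default stream, then the same filter enqueued asynchronously
// with explicit border handling (-1 is assumed to mean "same as rowBorderType").
void blurExample(const cv::gpu::GpuMat& src, cv::gpu::Stream& stream)
{
    cv::gpu::GpuMat dst;
    cv::gpu::GaussianBlur(src, dst, cv::Size(5, 5), 1.5, 1.5);

    cv::gpu::GpuMat dstAsync;
    cv::gpu::GaussianBlur(src, dstAsync, cv::Size(5, 5), 1.5, 1.5,
                          cv::BORDER_DEFAULT, -1, stream);
}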
void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
{
    GpuMat src = _src.getGpuMat();

    typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst,
                           float fy, float fx, int interpolation, cudaStream_t stream);
    static const func_t funcs[6][4] =
    {
        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
        {0 /*device::resize<schar>*/, 0 /*device::resize<char2>*/  , 0 /*device::resize<char3>*/, 0 /*device::resize<char4>*/},
        {device::resize<ushort>     , 0 /*device::resize<ushort2>*/, device::resize<ushort3>    , device::resize<ushort4>    },
        {device::resize<short>      , 0 /*device::resize<short2>*/ , device::resize<short3>     , device::resize<short4>     },
        {0 /*device::resize<int>*/  , 0 /*device::resize<int2>*/   , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
    };

    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
    CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );

    if (dsize == Size())
    {
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    }
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }

    _dst.create(dsize, src.type());
    GpuMat dst = _dst.getGpuMat();

    if (dsize == src.size())
    {
        src.copyTo(dst, stream);
        return;
    }

    const func_t func = funcs[src.depth()][src.channels() - 1];

    if (!func)
        CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
    PtrStepSzb wholeSrc(wholeSize.height, wholeSize.width, src.datastart, src.step);

    func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx),
         interpolation, StreamAccessor::getStream(stream));
}
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
                && fabs(beta) < std::numeric_limits<double>::epsilon();

    if (rtype < 0)
        rtype = src.type();
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels());

    int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
    if (sdepth == ddepth && noScale)
    {
        src.copyTo(dst);
        return;
    }

    // If src aliases dst, convert from a temporary copy instead
    GpuMat temp;
    const GpuMat* psrc = &src;
    if (sdepth != ddepth && psrc == &dst)
        psrc = &(temp = src);

    dst.create(src.size(), rtype);
    matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), alpha, beta, impl->stream);
}
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
    (void)src1;
    (void)src2;
    (void)alpha;
    (void)src3;
    (void)beta;
    (void)dst;
    (void)flags;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was built without CUBLAS");
#else
    // CUBLAS works with column-major matrices

    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));

    if (src1.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    bool tr1 = (flags & GEMM_1_T) != 0;
    bool tr2 = (flags & GEMM_2_T) != 0;
    bool tr3 = (flags & GEMM_3_T) != 0;

    if (src1.type() == CV_64FC2)
    {
        if (tr1 || tr2 || tr3)
            CV_Error(CV_StsNotImplemented, "The transpose operation is not implemented for the CV_64FC2 type");
    }

    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
    Size dstSize(src2Size.width, src1Size.height);

    CV_Assert(src1Size.width == src2Size.height);
    CV_Assert(src3.empty() || src3Size == dstSize);

    dst.create(dstSize, src1.type());

    if (beta != 0)
    {
        if (src3.empty())
        {
            if (stream)
                stream.enqueueMemSet(dst, Scalar::all(0));
            else
                dst.setTo(Scalar::all(0));
        }
        else
        {
            if (tr3)
            {
                transpose(src3, dst, stream);
            }
            else
            {
                if (stream)
                    stream.enqueueCopy(src3, dst);
                else
                    src3.copyTo(dst);
            }
        }
    }

    cublasHandle_t handle;
    cublasSafeCall( cublasCreate_v2(&handle) );
    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );

    const float alphaf = static_cast<float>(alpha);
    const float betaf  = static_cast<float>(beta);

    const cuComplex alphacf = make_cuComplex(alphaf, 0);
    const cuComplex betacf  = make_cuComplex(betaf, 0);

    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
    const cuDoubleComplex betac  = make_cuDoubleComplex(beta, 0);

    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;

    switch (src1.type())
    {
    case CV_32FC1:
        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphaf,
            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
            &betaf,
            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
        break;

    case CV_64FC1:
        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alpha,
            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
            &beta,
            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
        break;

    case CV_32FC2:
        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphacf,
            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
            &betacf,
            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
        break;

    case CV_64FC2:
        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphac,
            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
            &betac,
            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
        break;
    }

    cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
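// A hedged usage sketch for the gemm() above: dst = 2 * src1^T * src2 + 0.5 * src3,
// all CV_32FC1. The row-major/column-major swap is handled inside gemm();
// callers pass ordinary row-major GpuMats.
void gemmExample(const cv::gpu::GpuMat& src1, const cv::gpu::GpuMat& src2, const cv::gpu::GpuMat& src3)
{
    cv::gpu::GpuMat dst;
    cv::gpu::gemm(src1, src2, 2.0, src3, 0.5, dst, cv::GEMM_1_T);
}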
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT
    (void) _src;
    (void) _dst;
    (void) dft_size;
    (void) flags;
    (void) stream;
    throw_no_cuda();
#else
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );

    // We don't support unpacked output (in the case of real input)
    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );

    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
    const bool is_complex_input  = src.channels() == 2;
    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);

    // We don't support real-to-real transform
    CV_Assert( is_complex_input || is_complex_output );

    // Make sure here we work with the continuous input,
    // as CUFFT can't handle gaps
    GpuMat src_cont = src;
    createContinuous(src.rows, src.cols, src.type(), src_cont);
    if (src_cont.data != src.data)
        src.copyTo(src_cont, stream);

    Size dft_size_opt = dft_size;
    if (is_1d_input && !is_row_dft)
    {
        // If the source matrix is single column handle it as single row
        dft_size_opt.width  = std::max(dft_size.width, dft_size.height);
        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
    }

    CV_Assert( dft_size_opt.width > 1 );

    cufftType dft_type = CUFFT_R2C;
    if (is_complex_input)
        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;

    cufftHandle plan;
    if (is_1d_input || is_row_dft)
        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
    else
        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );

    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );

    if (is_complex_input)
    {
        if (is_complex_output)
        {
            createContinuous(dft_size, CV_32FC2, _dst);
            GpuMat dst = _dst.getGpuMat();

            cufftSafeCall( cufftExecC2C(plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                                        is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD) );
        }
        else
        {
            createContinuous(dft_size, CV_32F, _dst);
            GpuMat dst = _dst.getGpuMat();

            cufftSafeCall( cufftExecC2R(plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()) );
        }
    }
    else
    {
        // We could swap dft_size for efficiency. Here we must reflect it
        if (dft_size == dft_size_opt)
            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
        else
            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);

        GpuMat dst = _dst.getGpuMat();

        cufftSafeCall( cufftExecR2C(plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()) );
    }

    cufftSafeCall( cufftDestroy(plan) );

    if (is_scaled_dft)
        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
#endif
}
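// A hedged usage sketch for the dft() above: a forward real-to-complex DFT of
// a CV_32FC1 image. As the code shows, the real-input output holds only the
// non-redundant half of the spectrum (width cols/2 + 1); the full spectrum
// would have to be reconstructed by conjugate symmetry if needed.
void dftExample(const cv::cuda::GpuMat& srcf, cv::cuda::Stream& stream)
{
    cv::cuda::GpuMat spectrum;  // becomes CV_32FC2 of size (cols/2 + 1) x rows
    cv::cuda::dft(srcf, spectrum, srcf.size(), 0, stream);
}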
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA);
    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));

    if (dsize == Size())
    {
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    }
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }

    if (dsize != dst.size())
        dst.create(dsize, src.type());

    if (dsize == src.size())
    {
        if (s)
            s.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    cudaStream_t stream = StreamAccessor::getStream(s);

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (src.type() == CV_8UC4 && interpolation != INTER_AREA));

    if (useNpp)
    {
        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
                                    double xFactor, double yFactor, int eInterpolation);

        const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };

        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};

        NppiSize srcsz;
        srcsz.width  = wholeSize.width;
        srcsz.height = wholeSize.height;

        NppiRect srcrect;
        srcrect.x = ofs.x;
        srcrect.y = ofs.y;
        srcrect.width  = src.cols;
        srcrect.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        using namespace ::cv::gpu::device::imgproc;

        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);

        static const func_t funcs[6][4] =
        {
            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
    }
}
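// A hedged usage sketch for the cv::gpu::resize above: downscaling with scale
// factors (empty dsize) and upscaling to an explicit size.
void resizeExample(const cv::gpu::GpuMat& src)
{
    cv::gpu::GpuMat half, big;
    cv::gpu::resize(src, half, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
    cv::gpu::resize(src, big, cv::Size(src.cols * 2, src.rows * 2), 0, 0, cv::INTER_LINEAR);
}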
inline void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
{
    src.copyTo(dst, *this);
}
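// A hedged usage sketch tying the Stream methods above together: queue a copy
// and a conversion on one stream, then synchronize once at the end.
void streamExample(const cv::gpu::GpuMat& src)   // src: CV_8UC1
{
    cv::gpu::Stream stream;
    cv::gpu::GpuMat copy, asFloat;

    stream.enqueueCopy(src, copy);
    stream.enqueueConvert(src, asFloat, CV_32F, 1.0 / 255.0, 0.0);

    stream.waitForCompletion();  // block until both operations finish
}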