void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) { if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) { GpuMat src1 = _src1.getGpuMat(); GpuMat src2 = _src2.getGpuMat(); CV_Assert( src1.size() == src2.size() ); _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); divMat_8uc4_32f(src1, src2, dst, stream); } else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) { GpuMat src1 = _src1.getGpuMat(); GpuMat src2 = _src2.getGpuMat(); CV_Assert( src1.size() == src2.size() ); _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); divMat_16sc4_32f(src1, src2, dst, stream); } else { arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, divMat, divScalar); } }
GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream) { GpuMat dst; #ifndef HAVE_CUDA (void) _dst; (void) rows; (void) cols; (void) type; (void) stream; throw_no_cuda(); #else if (_dst.kind() == _InputArray::CUDA_GPU_MAT) { _dst.create(rows, cols, type); dst = _dst.getGpuMat(); } else { BufferPool pool(stream); dst = pool.getBuffer(rows, cols, type); } #endif return dst; }
void cv::cuda::fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, Stream& stream) { const GpuMat src = _src.getGpuMat(); CV_Assert(src.depth() == CV_8U && src.channels() < 4); int border_size = search_window/2 + block_window/2; Size esize = src.size() + Size(border_size, border_size) * 2; BufferPool pool(stream); GpuMat extended_src = pool.getBuffer(esize, src.type()); cv::cuda::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream); GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size())); int bcols, brows; device::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows); GpuMat buffer = pool.getBuffer(brows, bcols, CV_32S); using namespace cv::cuda::device::imgproc; typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0}; _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(stream)); }
void cv::cuda::rotate(InputArray _src, OutputArray _dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream) { typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call}, {0,0,0,0}, {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call}, {0,0,0,0}, {0,0,0,0}, {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call} }; GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F ); CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); dst.setTo(Scalar::all(0), stream); funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream)); }
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream) { #ifndef HAVE_CUFFT (void) _src1; (void) _src2; (void) _dst; (void) flags; (void) scale; (void) conjB; (void) stream; throw_no_cuda(); #else (void)flags; typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream); static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ }; GpuMat src1 = _src1.getGpuMat(); GpuMat src2 = _src2.getGpuMat(); CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2); CV_Assert( src1.size() == src2.size() ); _dst.create(src1.size(), CV_32FC2); GpuMat dst = _dst.getGpuMat(); Caller caller = callers[(int)conjB]; caller(src1, src2, scale, dst, StreamAccessor::getStream(stream)); #endif }
void cv::cuda::bilateralFilter(InputArray _src, OutputArray _dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& stream) { using cv::cuda::device::imgproc::bilateral_filter_gpu; typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s); static const func_t funcs[6][4] = { {bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> }, {0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/}, {bilateral_filter_gpu<ushort> , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3> , bilateral_filter_gpu<ushort4> }, {bilateral_filter_gpu<short> , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3> , bilateral_filter_gpu<short4> }, {0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ }, {bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> } }; sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color; sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial; int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2; kernel_size = std::max(radius, 1)*2 + 1; GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP ); const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert( func != 0 ); _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); func(src, dst, kernel_size, sigma_spatial, sigma_color, borderMode, StreamAccessor::getStream(stream)); }
void cv::gpu::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_8UC1 ); _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); int intBufSize; nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) ); size_t bufSize = intBufSize + 2 * 256 * sizeof(int); ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, _buf); GpuMat buf = _buf.getGpuMat(); GpuMat hist(1, 256, CV_32SC1, buf.data); GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int)); GpuMat intBuf(1, intBufSize, CV_8UC1, buf.data + 2 * 256 * sizeof(int)); gpu::calcHist(src, hist, _stream); cudaStream_t stream = StreamAccessor::getStream(_stream); NppStreamHandler h(stream); nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) ); hist::equalizeHist(src, dst, lut.ptr<int>(), stream); }
void cv::gpu::pyrDown(InputArray _src, OutputArray _dst, Stream& stream) { using namespace cv::gpu::cudev::imgproc; typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[6][4] = { {pyrDown_gpu<uchar> , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3> , pyrDown_gpu<uchar4> }, {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/}, {pyrDown_gpu<ushort> , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3> , pyrDown_gpu<ushort4> }, {pyrDown_gpu<short> , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3> , pyrDown_gpu<short4> }, {0 /*pyrDown_gpu<int>*/ , 0 /*pyrDown_gpu<int2>*/ , 0 /*pyrDown_gpu<int3>*/ , 0 /*pyrDown_gpu<int4>*/ }, {pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> } }; GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert( func != 0 ); _dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); GpuMat dst = _dst.getGpuMat(); func(src, dst, StreamAccessor::getStream(stream)); }
void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = _src.getGpuMat(); _dst.create(src.size(), CV_32FC1); GpuMat dst = _dst.getGpuMat(); npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); }
void cv::gpu::calcHist(InputArray _src, OutputArray _hist, Stream& stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_8UC1 ); _hist.create(1, 256, CV_32SC1); GpuMat hist = _hist.getGpuMat(); hist.setTo(Scalar::all(0), stream); hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream)); }
void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const { CV_Assert(fields); // only color images and precomputed integrals are supported int type = _image.type(); CV_Assert(type == CV_8UC3 || type == CV_32SC1 || (!_rois.empty())); const cv::gpu::GpuMat image = _image.getGpuMat(); if (_objects.empty()) _objects.create(1, 4096 * sizeof(Detection), CV_8UC1); cv::gpu::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); /// roi Fields& flds = *fields; int shr = flds.shrinkage; flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type()); device::shrink(rois, flds.mask); //cv::gpu::transpose(flds.genRoiTmp, flds.mask, s); if (type == CV_8UC3) { flds.update(image.rows, image.cols, flds.shrinkage); if (flds.check((float)minScale, (float)maxScale, scales)) flds.createLevels(image.rows, image.cols); flds.preprocessor->apply(image, flds.shrunk); integral(flds.shrunk, flds.hogluv, flds.integralBuffer, s); } else { if (s) s.enqueueCopy(image, flds.hogluv); else image.copyTo(flds.hogluv); } flds.detect(objects, s); if ( (flags && NMS_MASK) != NO_REJECT) { cv::gpu::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); flds.suppress(objects, s); flds.suppressed.copyTo(spr); } }
void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream) { GpuMat src = _src.getGpuMat(); typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { {device::resize<uchar> , 0 /*device::resize<uchar2>*/ , device::resize<uchar3> , device::resize<uchar4> }, {0 /*device::resize<schar>*/, 0 /*device::resize<char2>*/ , 0 /*device::resize<char3>*/, 0 /*device::resize<char4>*/}, {device::resize<ushort> , 0 /*device::resize<ushort2>*/, device::resize<ushort3> , device::resize<ushort4> }, {device::resize<short> , 0 /*device::resize<short2>*/ , device::resize<short3> , device::resize<short4> }, {0 /*device::resize<int>*/ , 0 /*device::resize<int2>*/ , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ }, {device::resize<float> , 0 /*device::resize<float2>*/ , device::resize<float3> , device::resize<float4> } }; CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA ); CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) ); if (dsize == Size()) { dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy)); } else { fx = static_cast<double>(dsize.width) / src.cols; fy = static_cast<double>(dsize.height) / src.rows; } _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); if (dsize == src.size()) { src.copyTo(dst, stream); return; } const func_t func = funcs[src.depth()][src.channels() - 1]; if (!func) CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); PtrStepSzb wholeSrc(wholeSize.height, wholeSize.width, src.datastart, src.step); func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx), interpolation, StreamAccessor::getStream(stream)); }
void cv::ogl::Buffer::copyTo(OutputArray arr, cuda::Stream& stream) const { #ifndef HAVE_OPENGL (void) arr; (void) stream; throw_no_ogl(); #else #ifndef HAVE_CUDA (void) arr; (void) stream; throw_no_cuda(); #else arr.create(rows_, cols_, type_); GpuMat dmat = arr.getGpuMat(); impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, cuda::StreamAccessor::getStream(stream)); #endif #endif }
void cv::cuda::remap(InputArray _src, OutputArray _dst, InputArray _xmap, InputArray _ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream) { using namespace cv::cuda::device::imgproc; typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); static const func_t funcs[6][4] = { {remap_gpu<uchar> , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3> , remap_gpu<uchar4> }, {0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/ , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/}, {remap_gpu<ushort> , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3> , remap_gpu<ushort4> }, {remap_gpu<short> , 0 /*remap_gpu<short2>*/ , remap_gpu<short3> , remap_gpu<short4> }, {0 /*remap_gpu<int>*/ , 0 /*remap_gpu<int2>*/ , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ }, {remap_gpu<float> , 0 /*remap_gpu<float2>*/ , remap_gpu<float3> , remap_gpu<float4> } }; GpuMat src = _src.getGpuMat(); GpuMat xmap = _xmap.getGpuMat(); GpuMat ymap = _ymap.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size() ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP ); const func_t func = funcs[src.depth()][src.channels() - 1]; if (!func) CV_Error(Error::StsUnsupportedFormat, "Unsupported input type"); _dst.create(xmap.size(), src.type()); GpuMat dst = _dst.getGpuMat(); Scalar_<float> borderValueFloat; borderValueFloat = borderValue; Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap, dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20)); }
void cv::cuda::nonLocalMeans(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, int borderMode, Stream& stream) { using cv::cuda::device::imgproc::nlm_bruteforce_gpu; typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream); static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ }; const GpuMat src = _src.getGpuMat(); CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3); const func_t func = funcs[src.channels() - 1]; CV_Assert(func != 0); int b = borderMode; CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP); _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); func(src, dst, search_window/2, block_window/2, h, borderMode, StreamAccessor::getStream(stream)); }
void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream); static const func_t funcs[5][4] = { {NppShift<CV_8U , 1, nppiLShiftC_8u_C1R>::call , 0, NppShift<CV_8U , 3, nppiLShiftC_8u_C3R>::call , NppShift<CV_8U , 4, nppiLShiftC_8u_C4R>::call }, {0 , 0, 0 , 0 }, {NppShift<CV_16U, 1, nppiLShiftC_16u_C1R>::call, 0, NppShift<CV_16U, 3, nppiLShiftC_16u_C3R>::call, NppShift<CV_16U, 4, nppiLShiftC_16u_C4R>::call}, {0 , 0, 0 , 0 }, {NppShift<CV_32S, 1, nppiLShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R>::call}, }; GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S ); CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); }
void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream) { typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream); static const func_t funcs[6][4] = { {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call}, {0,0,0,0}, {NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call}, {0,0,0,0}, {NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call}, {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call} }; GpuMat src = _src.getGpuMat(); CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F); CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); }
void cv::gpu::transpose(InputArray _src, OutputArray _dst, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 ); _dst.create( src.cols, src.rows, src.type() ); GpuMat dst = _dst.getGpuMat(); cudaStream_t stream = StreamAccessor::getStream(_stream); if (src.elemSize() == 1) { NppStreamHandler h(stream); NppiSize sz; sz.width = src.cols; sz.height = src.rows; nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else if (src.elemSize() == 4) { arithm::transpose<int>(src, dst, stream); } else // if (src.elemSize() == 8) { if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); arithm::transpose<double>(src, dst, stream); } }
void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS (void) _src1; (void) _src2; (void) alpha; (void) _src3; (void) beta; (void) _dst; (void) flags; (void) stream; CV_Error(Error::StsNotImplemented, "The library was build without CUBLAS"); #else // CUBLAS works with column-major matrices GpuMat src1 = _src1.getGpuMat(); GpuMat src2 = _src2.getGpuMat(); GpuMat src3 = _src3.getGpuMat(); CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 ); CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) ); if (src1.depth() == CV_64F) { if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); } bool tr1 = (flags & GEMM_1_T) != 0; bool tr2 = (flags & GEMM_2_T) != 0; bool tr3 = (flags & GEMM_3_T) != 0; if (src1.type() == CV_64FC2) { if (tr1 || tr2 || tr3) CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type"); } Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size(); Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size(); Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size(); Size dstSize(src2Size.width, src1Size.height); CV_Assert( src1Size.width == src2Size.height ); CV_Assert( src3.empty() || src3Size == dstSize ); _dst.create(dstSize, src1.type()); GpuMat dst = _dst.getGpuMat(); if (beta != 0) { if (src3.empty()) { dst.setTo(Scalar::all(0), stream); } else { if (tr3) { cuda::transpose(src3, dst, stream); } else { src3.copyTo(dst, stream); } } } cublasHandle_t handle; cublasSafeCall( cublasCreate_v2(&handle) ); cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) ); cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) ); const float alphaf = static_cast<float>(alpha); const float betaf = static_cast<float>(beta); const cuComplex alphacf = make_cuComplex(alphaf, 0); const cuComplex betacf = make_cuComplex(betaf, 0); const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0); const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0); cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N; switch (src1.type()) { case CV_32FC1: cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphaf, src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)), src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)), &betaf, dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) ); break; case CV_64FC1: cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alpha, src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)), src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)), &beta, dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) ); break; case CV_32FC2: cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphacf, src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)), src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)), &betacf, dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) ); break; case CV_64FC2: cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphac, src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)), src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)), &betac, dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) ); break; } cublasSafeCall( cublasDestroy_v2(handle) ); #endif }
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream) { #ifndef HAVE_CUFFT (void) _src; (void) _dst; (void) dft_size; (void) flags; (void) stream; throw_no_cuda(); #else GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 ); // We don't support unpacked output (in the case of real input) CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) ); const bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1); const bool is_row_dft = (flags & DFT_ROWS) != 0; const bool is_scaled_dft = (flags & DFT_SCALE) != 0; const bool is_inverse = (flags & DFT_INVERSE) != 0; const bool is_complex_input = src.channels() == 2; const bool is_complex_output = !(flags & DFT_REAL_OUTPUT); // We don't support real-to-real transform CV_Assert( is_complex_input || is_complex_output ); GpuMat src_cont = src; // Make sure here we work with the continuous input, // as CUFFT can't handle gaps createContinuous(src.rows, src.cols, src.type(), src_cont); if (src_cont.data != src.data) src.copyTo(src_cont, stream); Size dft_size_opt = dft_size; if (is_1d_input && !is_row_dft) { // If the source matrix is single column handle it as single row dft_size_opt.width = std::max(dft_size.width, dft_size.height); dft_size_opt.height = std::min(dft_size.width, dft_size.height); } CV_Assert( dft_size_opt.width > 1 ); cufftType dft_type = CUFFT_R2C; if (is_complex_input) dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R; cufftHandle plan; if (is_1d_input || is_row_dft) cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) ); else cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) ); cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); if (is_complex_input) { if (is_complex_output) { createContinuous(dft_size, CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2C( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(), is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); } else { createContinuous(dft_size, CV_32F, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2R( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>())); } } else { // We could swap dft_size for efficiency. Here we must reflect it if (dft_size == dft_size_opt) createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst); else createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecR2C( plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>())); } cufftSafeCall( cufftDestroy(plan) ); if (is_scaled_dft) cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream); #endif }
void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream) { GpuMat src = _src.getGpuMat(); Mat M = _M.getMat(); CV_Assert( M.rows == 3 && M.cols == 3 ); const int interpolation = flags & INTER_MAX; CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP) ; _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); static const bool useNppTab[6][4][3] = { { {false, false, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} } }; bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation]; // NPP bug on float data useNpp = useNpp && src.depth() != CV_32F; if (useNpp) { typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream); static const func_t funcs[2][6][4] = { { {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} }, { {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} } }; dst.setTo(borderValue, stream); double coeffs[3][3]; Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream)); } else { using namespace cv::cuda::device::imgproc; typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); static const func_t funcs[6][4] = { {warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> }, {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/}, {warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> }, {warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> }, {0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ }, {warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); float coeffs[3 * 3]; Mat coeffsMat(3, 3, CV_32F, (void*)coeffs); if (flags & WARP_INVERSE_MAP) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invert(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } Scalar_<float> borderValueFloat; borderValueFloat = borderValue; func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs, dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20)); } }
void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP ); _dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); GpuMat dst = _dst.getGpuMat(); cudaStream_t stream = StreamAccessor::getStream(_stream); if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1)) { NppiSize srcsz; srcsz.width = src.cols; srcsz.height = src.rows; NppiSize dstsz; dstsz.width = dst.cols; dstsz.height = dst.rows; NppStreamHandler h(stream); switch (src.type()) { case CV_8UC1: { Npp8u nVal = saturate_cast<Npp8u>(value[0]); nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_8UC4: { Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])}; nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32SC1: { Npp32s nVal = saturate_cast<Npp32s>(value[0]); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32FC1: { Npp32f val = saturate_cast<Npp32f>(value[0]); Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val)); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } } if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else { typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream); static const caller_t callers[6][4] = { { copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>}, {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/}, { copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>}, { copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>}, {0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/}, { copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>} }; caller_t func = callers[src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, top, left, borderType, value, stream); } }
void cv::cuda::buildWarpAffineMaps(InputArray _M, bool inverse, Size dsize, OutputArray _xmap, OutputArray _ymap, Stream& stream) { using namespace cv::cuda::device::imgproc; Mat M = _M.getMat(); CV_Assert( M.rows == 2 && M.cols == 3 ); _xmap.create(dsize, CV_32FC1); _ymap.create(dsize, CV_32FC1); GpuMat xmap = _xmap.getGpuMat(); GpuMat ymap = _ymap.getGpuMat(); float coeffs[2 * 3]; Mat coeffsMat(2, 3, CV_32F, (void*)coeffs); if (inverse) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invertAffineTransform(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream)); }