void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err) { if (prevPts.empty()) { nextPts.release(); status.release(); if (err) err->release(); return; } dim3 block, patch; calcPatchSize(winSize, block, patch); CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4); CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); CV_Assert(maxLevel >= 0); CV_Assert(winSize.width > 2 && winSize.height > 2); CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2); if (useInitialFlow) CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2); else ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); GpuMat temp2 = nextPts.reshape(1); multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2); ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); status.setTo(Scalar::all(1)); if (err) ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); // build the image pyramids. prevPyr_.resize(maxLevel + 1); nextPyr_.resize(maxLevel + 1); int cn = prevImg.channels(); if (cn == 1 || cn == 4) { prevImg.convertTo(prevPyr_[0], CV_32F); nextImg.convertTo(nextPyr_[0], CV_32F); } else { buf_.resize(1); cvtColor(prevImg, buf_[0], COLOR_BGR2BGRA); buf_[0].convertTo(prevPyr_[0], CV_32F); cvtColor(nextImg, buf_[0], COLOR_BGR2BGRA); buf_[0].convertTo(nextPyr_[0], CV_32F); } for (int level = 1; level <= maxLevel; ++level) { pyrDown(prevPyr_[level - 1], prevPyr_[level]); pyrDown(nextPyr_[level - 1], nextPyr_[level]); } pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters); for (int level = maxLevel; level >= 0; level--) { if (cn == 1) { pyrlk::sparse1(prevPyr_[level], nextPyr_[level], prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? 
err->ptr<float>() : 0, prevPts.cols, level, block, patch); } else { pyrlk::sparse4(prevPyr_[level], nextPyr_[level], prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols, level, block, patch); } } }
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream) { using namespace ::cv::gpu::device::matrix_reductions; CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F); CV_Assert(dim == 0 || dim == 1); CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN); if (dtype < 0) dtype = src.depth(); dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels())); if (dim == 0) { typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); static const caller_t callers[6][6] = { { reduceRows_gpu<unsigned char, int, unsigned char>, 0/*reduceRows_gpu<unsigned char, int, signed char>*/, 0/*reduceRows_gpu<unsigned char, int, unsigned short>*/, 0/*reduceRows_gpu<unsigned char, int, short>*/, reduceRows_gpu<unsigned char, int, int>, reduceRows_gpu<unsigned char, int, float> }, { 0/*reduceRows_gpu<signed char, int, unsigned char>*/, 0/*reduceRows_gpu<signed char, int, signed char>*/, 0/*reduceRows_gpu<signed char, int, unsigned short>*/, 0/*reduceRows_gpu<signed char, int, short>*/, 0/*reduceRows_gpu<signed char, int, int>*/, 0/*reduceRows_gpu<signed char, int, float>*/ }, { 0/*reduceRows_gpu<unsigned short, int, unsigned char>*/, 0/*reduceRows_gpu<unsigned short, int, signed char>*/, reduceRows_gpu<unsigned short, int, unsigned short>, 0/*reduceRows_gpu<unsigned short, int, short>*/, reduceRows_gpu<unsigned short, int, int>, reduceRows_gpu<unsigned short, int, float> }, { 0/*reduceRows_gpu<short, int, unsigned char>*/, 0/*reduceRows_gpu<short, int, signed char>*/, 0/*reduceRows_gpu<short, int, unsigned short>*/, reduceRows_gpu<short, int, short>, reduceRows_gpu<short, int, int>, reduceRows_gpu<short, int, float> }, { 0/*reduceRows_gpu<int, int, unsigned char>*/, 0/*reduceRows_gpu<int, int, signed char>*/, 0/*reduceRows_gpu<int, int, unsigned short>*/, 0/*reduceRows_gpu<int, int, 
short>*/, reduceRows_gpu<int, int, int>, reduceRows_gpu<int, int, float> }, { 0/*reduceRows_gpu<float, float, unsigned char>*/, 0/*reduceRows_gpu<float, float, signed char>*/, 0/*reduceRows_gpu<float, float, unsigned short>*/, 0/*reduceRows_gpu<float, float, short>*/, 0/*reduceRows_gpu<float, float, int>*/, reduceRows_gpu<float, float, float> } }; const caller_t func = callers[src.depth()][dst.depth()]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats"); func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream)); } else { typedef void (*caller_t)(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); static const caller_t callers[6][6] = { { reduceCols_gpu<unsigned char, int, unsigned char>, 0/*reduceCols_gpu<unsigned char, int, signed char>*/, 0/*reduceCols_gpu<unsigned char, int, unsigned short>*/, 0/*reduceCols_gpu<unsigned char, int, short>*/, reduceCols_gpu<unsigned char, int, int>, reduceCols_gpu<unsigned char, int, float> }, { 0/*reduceCols_gpu<signed char, int, unsigned char>*/, 0/*reduceCols_gpu<signed char, int, signed char>*/, 0/*reduceCols_gpu<signed char, int, unsigned short>*/, 0/*reduceCols_gpu<signed char, int, short>*/, 0/*reduceCols_gpu<signed char, int, int>*/, 0/*reduceCols_gpu<signed char, int, float>*/ }, { 0/*reduceCols_gpu<unsigned short, int, unsigned char>*/, 0/*reduceCols_gpu<unsigned short, int, signed char>*/, reduceCols_gpu<unsigned short, int, unsigned short>, 0/*reduceCols_gpu<unsigned short, int, short>*/, reduceCols_gpu<unsigned short, int, int>, reduceCols_gpu<unsigned short, int, float> }, { 0/*reduceCols_gpu<short, int, unsigned char>*/, 0/*reduceCols_gpu<short, int, signed char>*/, 0/*reduceCols_gpu<short, int, unsigned short>*/, reduceCols_gpu<short, int, short>, reduceCols_gpu<short, int, int>, reduceCols_gpu<short, int, float> }, { 0/*reduceCols_gpu<int, int, unsigned char>*/, 0/*reduceCols_gpu<int, int, signed 
char>*/, 0/*reduceCols_gpu<int, int, unsigned short>*/, 0/*reduceCols_gpu<int, int, short>*/, reduceCols_gpu<int, int, int>, reduceCols_gpu<int, int, float> }, { 0/*reduceCols_gpu<float, unsigned char>*/, 0/*reduceCols_gpu<float, signed char>*/, 0/*reduceCols_gpu<float, unsigned short>*/, 0/*reduceCols_gpu<float, short>*/, 0/*reduceCols_gpu<float, int>*/, reduceCols_gpu<float, float, float> } }; const caller_t func = callers[src.depth()][dst.depth()]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats"); func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream)); } }
void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream) { dst.resize(src.channels()); if(src.channels() > 0) split_merge::split(src, &dst[0], StreamAccessor::getStream(stream)); }
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s) { CV_Assert(M.rows == 3 && M.cols == 3); int interpolation = flags & INTER_MAX; CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC); CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); static const bool useNppTab[6][4][3] = { { {false, false, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} } }; bool useNpp = borderMode == BORDER_CONSTANT; useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation]; #ifdef linux // NPP bug on float data useNpp = useNpp && src.depth() != CV_32F; #endif if (useNpp) { typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream); static const func_t funcs[2][6][4] = { { {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, {0, 
0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} }, { {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} } }; double coeffs[3][3]; Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s)); } else { using namespace cv::gpu::device::imgproc; typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); static const func_t funcs[6][4] = { {warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> }, {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/}, {warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, 
warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> }, {warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> }, {0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ }, {warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType)); dst.create(dsize, src.type()); float coeffs[3 * 3]; Mat coeffsMat(3, 3, CV_32F, (void*)coeffs); if (flags & WARP_INVERSE_MAP) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invert(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } Scalar_<float> borderValueFloat; borderValueFloat = borderValue; DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc); } }
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf) { using namespace ::cv::gpu::device::matrix_reductions::minmaxloc; typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); static Caller multipass_callers[] = { minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 }; static Caller singlepass_callers[] = { minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> }; static MaskedCaller masked_multipass_callers[] = { minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 }; static MaskedCaller masked_singlepass_callers[] = { minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> }; CV_Assert(src.depth() <= CV_64F); CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); if (src.depth() == CV_64F) { if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } double minVal_; if (!minVal) minVal = &minVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_; int minLoc_[2]; int maxLoc_[2]; Size valbuf_size, locbuf_size; getBufSizeRequired(src.cols, 
src.rows, static_cast<int>(src.elemSize()), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height); ensureSizeIsEnough(valbuf_size, CV_8U, valBuf); ensureSizeIsEnough(locbuf_size, CV_8U, locBuf); if (mask.empty()) { Caller* callers = multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = singlepass_callers; Caller caller = callers[src.type()]; CV_Assert(caller != 0); caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } else { MaskedCaller* callers = masked_multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = masked_singlepass_callers; MaskedCaller caller = callers[src.type()]; CV_Assert(caller != 0); caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; } if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; } }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const vector<GpuMat>& masks, Stream& stream) { if (query.empty() || empty()) return; using namespace ::cv::gpu::device::bf_radius_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream); static const caller_t callers[3][6] = { { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }, { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }, { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ } }; DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS)); const int nQuery = query.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size())); ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx); ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx); ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance); } if (stream) stream.enqueueMemSet(nMatches, Scalar::all(0)); else nMatches.setTo(Scalar::all(0)); caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); 
vector<DevMem2Db> trains_(trainDescCollection.begin(), trainDescCollection.end()); vector<DevMem2Db> masks_(masks.begin(), masks.end()); func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); }
// Enqueues a masked fill of src with val on this stream.
// The depth and channel count of src are forwarded to the fill routine.
void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
{
    const int depth = src.depth();
    const int channels = src.channels();

    matrix_operations::set_to_with_mask(src, depth, val.val, mask, channels, impl->stream);
}
void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const vector<GpuMat>& masks, Stream& stream) { if (query.empty() || empty()) return; using namespace cv::gpu::device::bf_radius_match; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); static const caller_t callersL1[] = { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }; static const caller_t callersL2[] = { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }; static const caller_t callersHamming[] = { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ }; DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); const int nQuery = query.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size())); CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? 
callersL2 : callersHamming; ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx); ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx); ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance); } if (stream) stream.enqueueMemSet(nMatches, Scalar::all(0)); else nMatches.setTo(Scalar::all(0)); caller_t func = callers[query.depth()]; CV_Assert(func != 0); vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end()); vector<PtrStepSzb> masks_(masks.begin(), masks.end()); func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); }
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream) { #ifndef HAVE_CUFFT (void) _src; (void) _dst; (void) dft_size; (void) flags; (void) stream; throw_no_cuda(); #else GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 ); // We don't support unpacked output (in the case of real input) CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) ); const bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1); const bool is_row_dft = (flags & DFT_ROWS) != 0; const bool is_scaled_dft = (flags & DFT_SCALE) != 0; const bool is_inverse = (flags & DFT_INVERSE) != 0; const bool is_complex_input = src.channels() == 2; const bool is_complex_output = !(flags & DFT_REAL_OUTPUT); // We don't support real-to-real transform CV_Assert( is_complex_input || is_complex_output ); GpuMat src_cont = src; // Make sure here we work with the continuous input, // as CUFFT can't handle gaps createContinuous(src.rows, src.cols, src.type(), src_cont); if (src_cont.data != src.data) src.copyTo(src_cont, stream); Size dft_size_opt = dft_size; if (is_1d_input && !is_row_dft) { // If the source matrix is single column handle it as single row dft_size_opt.width = std::max(dft_size.width, dft_size.height); dft_size_opt.height = std::min(dft_size.width, dft_size.height); } CV_Assert( dft_size_opt.width > 1 ); cufftType dft_type = CUFFT_R2C; if (is_complex_input) dft_type = is_complex_output ? 
CUFFT_C2C : CUFFT_C2R; cufftHandle plan; if (is_1d_input || is_row_dft) cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) ); else cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) ); cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); if (is_complex_input) { if (is_complex_output) { createContinuous(dft_size, CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2C( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(), is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); } else { createContinuous(dft_size, CV_32F, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2R( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>())); } } else { // We could swap dft_size for efficiency. Here we must reflect it if (dft_size == dft_size_opt) createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst); else createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecR2C( plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>())); } cufftSafeCall( cufftDestroy(plan) ); if (is_scaled_dft) cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream); #endif }
void cv::gpu::FarnebackOpticalFlow::operator ()( const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s) { CV_Assert(frame0.channels() == 1 && frame1.channels() == 1); CV_Assert(frame0.size() == frame1.size()); CV_Assert(polyN == 5 || polyN == 7); CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6); Stream streams[5]; if (S(s)) streams[0] = s; Size size = frame0.size(); GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY; flowx.create(size, CV_32F); flowy.create(size, CV_32F); GpuMat flowx0 = flowx; GpuMat flowy0 = flowy; // Crop unnecessary levels double scale = 1; int numLevelsCropped = 0; for (; numLevelsCropped < numLevels; numLevelsCropped++) { scale *= pyrScale; if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE) break; } streams[0].enqueueConvert(frame0, frames_[0], CV_32F); streams[1].enqueueConvert(frame1, frames_[1], CV_32F); if (fastPyramids) { // Build Gaussian pyramids using pyrDown() pyramid0_.resize(numLevelsCropped + 1); pyramid1_.resize(numLevelsCropped + 1); pyramid0_[0] = frames_[0]; pyramid1_[0] = frames_[1]; for (int i = 1; i <= numLevelsCropped; ++i) { pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]); pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]); } } setPolynomialExpansionConsts(polyN, polySigma); device::optflow_farneback::setUpdateMatricesConsts(); for (int k = numLevelsCropped; k >= 0; k--) { streams[0].waitForCompletion(); scale = 1; for (int i = 0; i < k; i++) scale *= pyrScale; double sigma = (1./scale - 1) * 0.5; int smoothSize = cvRound(sigma*5) | 1; smoothSize = std::max(smoothSize, 3); int width = cvRound(size.width*scale); int height = cvRound(size.height*scale); if (fastPyramids) { width = pyramid0_[k].cols; height = pyramid0_[k].rows; } if (k > 0) { curFlowX.create(height, width, CV_32F); curFlowY.create(height, width, CV_32F); } else { curFlowX = flowx0; curFlowY = flowy0; } if (!prevFlowX.data) { if (flags & OPTFLOW_USE_INITIAL_FLOW) { #if ENABLE_GPU_RESIZE resize(flowx0, 
curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]); resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]); streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale); streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale); #else Mat tmp1, tmp2; flowx0.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA); tmp2 *= scale; curFlowX.upload(tmp2); flowy0.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA); tmp2 *= scale; curFlowY.upload(tmp2); #endif } else { streams[0].enqueueMemSet(curFlowX, 0); streams[1].enqueueMemSet(curFlowY, 0); } } else { #if ENABLE_GPU_RESIZE resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]); resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]); streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale); streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale); #else Mat tmp1, tmp2; prevFlowX.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR); tmp2 *= 1./pyrScale; curFlowX.upload(tmp2); prevFlowY.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR); tmp2 *= 1./pyrScale; curFlowY.upload(tmp2); #endif } GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_); GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_); GpuMat R[2] = { allocMatFromBuf(5*height, width, CV_32F, R_[0]), allocMatFromBuf(5*height, width, CV_32F, R_[1]) }; if (fastPyramids) { device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0])); device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1])); } else { GpuMat blurredFrame[2] = { allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]), allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1]) }; GpuMat pyrLevel[2] = { allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]), 
allocMatFromBuf(height, width, CV_32F, pyrLevel_[1]) }; Mat g = getGaussianKernel(smoothSize, sigma, CV_32F); device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2); for (int i = 0; i < 2; i++) { device::optflow_farneback::gaussianBlurGpu( frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i])); #if ENABLE_GPU_RESIZE resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR, streams[i]); #else Mat tmp1, tmp2; tmp[i].download(tmp1); resize(tmp1, tmp2, Size(width, height), INTER_LINEAR); I[i].upload(tmp2); #endif device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i])); } } streams[1].waitForCompletion(); device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0])); if (flags & OPTFLOW_FARNEBACK_GAUSSIAN) { Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F); device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2); } for (int i = 0; i < numIters; i++) { if (flags & OPTFLOW_FARNEBACK_GAUSSIAN) updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams); else updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams); } prevFlowX = curFlowX; prevFlowY = curFlowY; } flowx = curFlowX; flowy = curFlowY; if (!S(s)) streams[0].waitForCompletion(); }
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4)); const Scalar zero = Scalar::all(0); cudaStream_t cudaStream = StreamAccessor::getStream(stream); //////////////////////////////////////////////////////////////////////////////////////////// // Init int rows = left.rows; int cols = left.cols; rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0))); int levels = rthis.levels; // compute sizes AutoBuffer<int> buf(levels * 3); int* cols_pyr = buf; int* rows_pyr = cols_pyr + levels; int* nr_plane_pyr = rows_pyr + levels; cols_pyr[0] = cols; rows_pyr[0] = rows; nr_plane_pyr[0] = rthis.nr_plane; for (int i = 1; i < levels; i++) { cols_pyr[i] = cols_pyr[i-1] / 2; rows_pyr[i] = rows_pyr[i-1] / 2; nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2; } GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected; //allocate buffers int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2 buffers_count += 2; // data_cost has twice more rows than other buffers, what's why +2, not +1; buffers_count += 1; // data_cost_selected mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type); data_cost = mbuf.rowRange(0, rows * rthis.nr_plane * 2); data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane); for(int k = 0; k < 2; ++k) // in/out { GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows); GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2); GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] }; for(int _r = 0; _r < 
5; ++_r) { *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5); CV_DbgAssert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane); } }; size_t elem_step = mbuf.step / sizeof(T); Size temp_size = data_cost.size(); if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp) temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp); temp.create(temp_size, DataType<T>::type); //////////////////////////////////////////////////////////////////////////// // Compute load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); if (stream) { stream.enqueueMemSet(l[0], zero); stream.enqueueMemSet(d[0], zero); stream.enqueueMemSet(r[0], zero); stream.enqueueMemSet(u[0], zero); stream.enqueueMemSet(l[1], zero); stream.enqueueMemSet(d[1], zero); stream.enqueueMemSet(r[1], zero); stream.enqueueMemSet(u[1], zero); stream.enqueueMemSet(data_cost, zero); stream.enqueueMemSet(data_cost_selected, zero); } else { l[0].setTo(zero); d[0].setTo(zero); r[0].setTo(zero); u[0].setTo(zero); l[1].setTo(zero); d[1].setTo(zero); r[1].setTo(zero); u[1].setTo(zero); data_cost.setTo(zero); data_cost_selected.setTo(zero); } int cur_idx = 0; for (int i = levels - 1; i >= 0; i--) { if (i == levels - 1) { init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream); } else { compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step, left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream); int new_idx = (cur_idx + 1) & 1; init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(), u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), 
r[cur_idx].ptr<T>(), disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream); cur_idx = new_idx; } calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream); } if (disp.empty()) disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out)); if (stream) stream.enqueueMemSet(out, zero); else out.setTo(zero); compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream); if (disp.type() != CV_16S) { if (stream) stream.enqueueConvert(out, disp, disp.type()); else out.convertTo(disp, disp.type()); } }
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s) { CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA); CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0)); if (dsize == Size()) dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy)); else { fx = static_cast<double>(dsize.width) / src.cols; fy = static_cast<double>(dsize.height) / src.rows; } if (dsize != dst.size()) dst.create(dsize, src.type()); if (dsize == src.size()) { if (s) s.enqueueCopy(src, dst); else src.copyTo(dst); return; } cudaStream_t stream = StreamAccessor::getStream(s); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4); useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (src.type() == CV_8UC4 && interpolation != INTER_AREA)); if (useNpp) { typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize, double xFactor, double yFactor, int eInterpolation); const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R }; static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS}; NppiSize srcsz; srcsz.width = wholeSize.width; srcsz.height = wholeSize.height; NppiRect srcrect; srcrect.x = ofs.x; srcrect.y = ofs.y; srcrect.width = src.cols; srcrect.height = src.rows; NppiSize dstsz; dstsz.width = dst.cols; dstsz.height = dst.rows; NppStreamHandler h(stream); nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else { using namespace 
::cv::gpu::device::imgproc; typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { {resize_gpu<uchar> , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3> , resize_gpu<uchar4> }, {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/ , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/}, {resize_gpu<ushort> , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3> , resize_gpu<ushort4> }, {resize_gpu<short> , 0 /*resize_gpu<short2>*/ , resize_gpu<short3> , resize_gpu<short4> }, {0 /*resize_gpu<int>*/ , 0 /*resize_gpu<int2>*/ , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ }, {resize_gpu<float> , 0 /*resize_gpu<float2>*/ , resize_gpu<float3> , resize_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream); } }
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf) { using namespace mathfunc::minmax; typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); static Caller multipass_callers[7] = { minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 }; static Caller singlepass_callers[7] = { minMaxCaller<unsigned char>, minMaxCaller<char>, minMaxCaller<unsigned short>, minMaxCaller<short>, minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> }; static MaskedCaller masked_multipass_callers[7] = { minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>, minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 }; static MaskedCaller masked_singlepass_callers[7] = { minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> }; CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); double minVal_; if (!minVal) minVal = &minVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_; Size buf_size; getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height); ensureSizeIsEnough(buf_size, CV_8U, buf); if (mask.empty()) { Caller* callers = multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = singlepass_callers; Caller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: 
unsupported type"); caller(src, minVal, maxVal, buf); } else { MaskedCaller* callers = masked_multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = masked_singlepass_callers; MaskedCaller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); caller(src, mask, minVal, maxVal, buf); } }
double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf) { CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2); CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1)); GpuMat src_single_channel = src.reshape(1); if (normType == NORM_L1) return gpu::absSum(src_single_channel, mask, buf)[0]; if (normType == NORM_L2) return std::sqrt(gpu::sqrSum(src_single_channel, mask, buf)[0]); // NORM_INF double min_val, max_val; gpu::minMax(src_single_channel, &min_val, &max_val, mask, buf); return std::max(std::abs(min_val), std::abs(max_val)); }
void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream) { GpuMat src = _src.getGpuMat(); Mat M = _M.getMat(); CV_Assert( M.rows == 3 && M.cols == 3 ); const int interpolation = flags & INTER_MAX; CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP) ; _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); static const bool useNppTab[6][4][3] = { { {false, false, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} } }; bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation]; // NPP bug on float data useNpp = useNpp && src.depth() != CV_32F; if (useNpp) { typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream); static const func_t funcs[2][6][4] = { { {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, 
nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} }, { {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} } }; dst.setTo(borderValue, stream); double coeffs[3][3]; Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream)); } else { using namespace cv::cuda::device::imgproc; typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); static const func_t funcs[6][4] = { {warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> }, {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 
/*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/}, {warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> }, {warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> }, {0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ }, {warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); float coeffs[3 * 3]; Mat coeffsMat(3, 3, CV_32F, (void*)coeffs); if (flags & WARP_INVERSE_MAP) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invert(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } Scalar_<float> borderValueFloat; borderValueFloat = borderValue; func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs, dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20)); } }
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) { class LevelsInit { public: Npp32s pLevels[256]; const Npp32s* pLevels3[3]; int nValues3[3]; #if (CUDA_VERSION > 4020) GpuMat d_pLevels; #endif LevelsInit() { nValues3[0] = nValues3[1] = nValues3[2] = 256; for (int i = 0; i < 256; ++i) pLevels[i] = i; #if (CUDA_VERSION <= 4020) pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; #else d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); #endif } }; static LevelsInit lvls; int cn = src.channels(); CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3); CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous()); dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn)); NppiSize sz; sz.height = src.rows; sz.width = src.cols; Mat nppLut; lut.convertTo(nppLut, CV_32S); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); if (src.type() == CV_8UC1) { #if (CUDA_VERSION <= 4020) nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) ); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) ); #endif } else { const Npp32s* pValues3[3]; Mat nppLut3[3]; if (nppLut.channels() == 1) { #if (CUDA_VERSION <= 4020) pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>(); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>(); #endif } else { cv::split(nppLut, nppLut3); #if (CUDA_VERSION <= 4020) pValues3[0] = nppLut3[0].ptr<Npp32s>(); pValues3[1] = nppLut3[1].ptr<Npp32s>(); pValues3[2] = nppLut3[2].ptr<Npp32s>(); #else 
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data)); GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data)); GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data)); pValues3[0] = d_nppLut0.ptr<Npp32s>(); pValues3[1] = d_nppLut1.ptr<Npp32s>(); pValues3[2] = d_nppLut2.ptr<Npp32s>(); #endif } nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) ); } if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream) { if (query.empty() || train.empty()) return; using namespace ::cv::gpu::device::bf_knnmatch; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); static const caller_t callers[3][6] = { { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }, { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }, { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ } }; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); const int nQuery = query.rows; const int nTrain = train.rows; if (k == 2) { ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); } else { ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); ensureSizeIsEnough(nQuery, k, CV_32F, distance); ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); } if (stream) stream.enqueueMemSet(trainIdx, Scalar::all(-1)); else trainIdx.setTo(Scalar::all(-1)); caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); }
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream) { CV_Assert( src.channels() <= 4 ); CV_Assert( dim == 0 || dim == 1 ); CV_Assert( reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN ); if (dtype < 0) dtype = src.depth(); dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); if (dim == 0) { typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream); static const func_t funcs[7][7] = { { ::reduce::rows<unsigned char, int, unsigned char>, 0/*::reduce::rows<unsigned char, int, signed char>*/, 0/*::reduce::rows<unsigned char, int, unsigned short>*/, 0/*::reduce::rows<unsigned char, int, short>*/, ::reduce::rows<unsigned char, int, int>, ::reduce::rows<unsigned char, float, float>, ::reduce::rows<unsigned char, double, double> }, { 0/*::reduce::rows<signed char, int, unsigned char>*/, 0/*::reduce::rows<signed char, int, signed char>*/, 0/*::reduce::rows<signed char, int, unsigned short>*/, 0/*::reduce::rows<signed char, int, short>*/, 0/*::reduce::rows<signed char, int, int>*/, 0/*::reduce::rows<signed char, float, float>*/, 0/*::reduce::rows<signed char, double, double>*/ }, { 0/*::reduce::rows<unsigned short, int, unsigned char>*/, 0/*::reduce::rows<unsigned short, int, signed char>*/, ::reduce::rows<unsigned short, int, unsigned short>, 0/*::reduce::rows<unsigned short, int, short>*/, ::reduce::rows<unsigned short, int, int>, ::reduce::rows<unsigned short, float, float>, ::reduce::rows<unsigned short, double, double> }, { 0/*::reduce::rows<short, int, unsigned char>*/, 0/*::reduce::rows<short, int, signed char>*/, 0/*::reduce::rows<short, int, unsigned short>*/, ::reduce::rows<short, int, short>, ::reduce::rows<short, int, int>, ::reduce::rows<short, float, float>, ::reduce::rows<short, double, double> }, { 0/*::reduce::rows<int, int, unsigned char>*/, 0/*::reduce::rows<int, int, signed 
char>*/, 0/*::reduce::rows<int, int, unsigned short>*/, 0/*::reduce::rows<int, int, short>*/, ::reduce::rows<int, int, int>, ::reduce::rows<int, float, float>, ::reduce::rows<int, double, double> }, { 0/*::reduce::rows<float, float, unsigned char>*/, 0/*::reduce::rows<float, float, signed char>*/, 0/*::reduce::rows<float, float, unsigned short>*/, 0/*::reduce::rows<float, float, short>*/, 0/*::reduce::rows<float, float, int>*/, ::reduce::rows<float, float, float>, ::reduce::rows<float, double, double> }, { 0/*::reduce::rows<double, double, unsigned char>*/, 0/*::reduce::rows<double, double, signed char>*/, 0/*::reduce::rows<double, double, unsigned short>*/, 0/*::reduce::rows<double, double, short>*/, 0/*::reduce::rows<double, double, int>*/, 0/*::reduce::rows<double, double, float>*/, ::reduce::rows<double, double, double> } }; const func_t func = funcs[src.depth()][dst.depth()]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats"); func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream)); } else { typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); static const func_t funcs[7][7] = { { ::reduce::cols<unsigned char, int, unsigned char>, 0/*::reduce::cols<unsigned char, int, signed char>*/, 0/*::reduce::cols<unsigned char, int, unsigned short>*/, 0/*::reduce::cols<unsigned char, int, short>*/, ::reduce::cols<unsigned char, int, int>, ::reduce::cols<unsigned char, float, float>, ::reduce::cols<unsigned char, double, double> }, { 0/*::reduce::cols<signed char, int, unsigned char>*/, 0/*::reduce::cols<signed char, int, signed char>*/, 0/*::reduce::cols<signed char, int, unsigned short>*/, 0/*::reduce::cols<signed char, int, short>*/, 0/*::reduce::cols<signed char, int, int>*/, 0/*::reduce::cols<signed char, float, float>*/, 0/*::reduce::cols<signed char, double, double>*/ }, { 0/*::reduce::cols<unsigned short, int, unsigned char>*/, 0/*::reduce::cols<unsigned 
short, int, signed char>*/, ::reduce::cols<unsigned short, int, unsigned short>, 0/*::reduce::cols<unsigned short, int, short>*/, ::reduce::cols<unsigned short, int, int>, ::reduce::cols<unsigned short, float, float>, ::reduce::cols<unsigned short, double, double> }, { 0/*::reduce::cols<short, int, unsigned char>*/, 0/*::reduce::cols<short, int, signed char>*/, 0/*::reduce::cols<short, int, unsigned short>*/, ::reduce::cols<short, int, short>, ::reduce::cols<short, int, int>, ::reduce::cols<short, float, float>, ::reduce::cols<short, double, double> }, { 0/*::reduce::cols<int, int, unsigned char>*/, 0/*::reduce::cols<int, int, signed char>*/, 0/*::reduce::cols<int, int, unsigned short>*/, 0/*::reduce::cols<int, int, short>*/, ::reduce::cols<int, int, int>, ::reduce::cols<int, float, float>, ::reduce::cols<int, double, double> }, { 0/*::reduce::cols<float, float, unsigned char>*/, 0/*::reduce::cols<float, float, signed char>*/, 0/*::reduce::cols<float, float, unsigned short>*/, 0/*::reduce::cols<float, float, short>*/, 0/*::reduce::cols<float, float, int>*/, ::reduce::cols<float, float, float>, ::reduce::cols<float, double, double> }, { 0/*::reduce::cols<double, double, unsigned char>*/, 0/*::reduce::cols<double, double, signed char>*/, 0/*::reduce::cols<double, double, unsigned short>*/, 0/*::reduce::cols<double, double, short>*/, 0/*::reduce::cols<double, double, int>*/, 0/*::reduce::cols<double, double, float>*/, ::reduce::cols<double, double, double> } }; const func_t func = funcs[src.depth()][dst.depth()]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats"); func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream)); } }
void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP ); _dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); GpuMat dst = _dst.getGpuMat(); cudaStream_t stream = StreamAccessor::getStream(_stream); if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1)) { NppiSize srcsz; srcsz.width = src.cols; srcsz.height = src.rows; NppiSize dstsz; dstsz.width = dst.cols; dstsz.height = dst.rows; NppStreamHandler h(stream); switch (src.type()) { case CV_8UC1: { Npp8u nVal = saturate_cast<Npp8u>(value[0]); nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_8UC4: { Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])}; nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32SC1: { Npp32s nVal = saturate_cast<Npp32s>(value[0]); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32FC1: { Npp32f val = saturate_cast<Npp32f>(value[0]); Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val)); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) 
); break; } } if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else { typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream); static const caller_t callers[6][4] = { { copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>}, {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/}, { copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>}, { copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>}, {0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/}, { copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>} }; caller_t func = callers[src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, top, left, borderType, value, stream); } }
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2], GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4)); const Scalar zero = Scalar::all(0); cudaStream_t cudaStream = StreamAccessor::getStream(stream); //////////////////////////////////////////////////////////////////////////////////////////// // Init int rows = left.rows; int cols = left.cols; rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0))); int levels = rthis.levels; AutoBuffer<int> buf(levels * 4); int* cols_pyr = buf; int* rows_pyr = cols_pyr + levels; int* nr_plane_pyr = rows_pyr + levels; int* step_pyr = nr_plane_pyr + levels; cols_pyr[0] = cols; rows_pyr[0] = rows; nr_plane_pyr[0] = rthis.nr_plane; const int n = 64; step_pyr[0] = static_cast<int>(alignSize(cols * sizeof(T), n) / sizeof(T)); for (int i = 1; i < levels; i++) { cols_pyr[i] = (cols_pyr[i-1] + 1) / 2; rows_pyr[i] = (rows_pyr[i-1] + 1) / 2; nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2; step_pyr[i] = static_cast<int>(alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T)); } Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]); Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2); u[0].create(msg_size, DataType<T>::type); d[0].create(msg_size, DataType<T>::type); l[0].create(msg_size, DataType<T>::type); r[0].create(msg_size, DataType<T>::type); u[1].create(msg_size, DataType<T>::type); d[1].create(msg_size, DataType<T>::type); l[1].create(msg_size, DataType<T>::type); r[1].create(msg_size, DataType<T>::type); disp_selected_pyr[0].create(msg_size, DataType<T>::type); 
disp_selected_pyr[1].create(msg_size, DataType<T>::type); data_cost.create(data_cost_size, DataType<T>::type); data_cost_selected.create(msg_size, DataType<T>::type); step_pyr[0] = static_cast<int>(data_cost.step / sizeof(T)); Size temp_size = data_cost_size; if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp) temp_size = Size(step_pyr[levels - 1], rows_pyr[levels - 1] * rthis.ndisp); temp.create(temp_size, DataType<T>::type); //////////////////////////////////////////////////////////////////////////// // Compute load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); if (stream) { stream.enqueueMemSet(l[0], zero); stream.enqueueMemSet(d[0], zero); stream.enqueueMemSet(r[0], zero); stream.enqueueMemSet(u[0], zero); stream.enqueueMemSet(l[1], zero); stream.enqueueMemSet(d[1], zero); stream.enqueueMemSet(r[1], zero); stream.enqueueMemSet(u[1], zero); stream.enqueueMemSet(data_cost, zero); stream.enqueueMemSet(data_cost_selected, zero); } else { l[0].setTo(zero); d[0].setTo(zero); r[0].setTo(zero); u[0].setTo(zero); l[1].setTo(zero); d[1].setTo(zero); r[1].setTo(zero); u[1].setTo(zero); data_cost.setTo(zero); data_cost_selected.setTo(zero); } int cur_idx = 0; for (int i = levels - 1; i >= 0; i--) { if (i == levels - 1) { init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream); } else { compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream); int new_idx = (cur_idx + 1) & 1; init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(), 
u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream); cur_idx = new_idx; } calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream); } if (disp.empty()) disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out)); if (stream) stream.enqueueMemSet(out, zero); else out.setTo(zero); compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream); if (disp.type() != CV_16S) { if (stream) stream.enqueueConvert(out, disp, disp.type()); else out.convertTo(disp, disp.type()); } }