void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream) { GpuMat src = _src.getGpuMat(); CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP ); _dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); GpuMat dst = _dst.getGpuMat(); cudaStream_t stream = StreamAccessor::getStream(_stream); if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1)) { NppiSize srcsz; srcsz.width = src.cols; srcsz.height = src.rows; NppiSize dstsz; dstsz.width = dst.cols; dstsz.height = dst.rows; NppStreamHandler h(stream); switch (src.type()) { case CV_8UC1: { Npp8u nVal = saturate_cast<Npp8u>(value[0]); nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_8UC4: { Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])}; nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32SC1: { Npp32s nVal = saturate_cast<Npp32s>(value[0]); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } case CV_32FC1: { Npp32f val = saturate_cast<Npp32f>(value[0]); Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val)); nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz, dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) ); break; } } if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else { typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream); static const caller_t callers[6][4] = { { copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>}, {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/}, { copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>}, { copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>}, {0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/}, { copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>} }; caller_t func = callers[src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, top, left, borderType, value, stream); } }
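// Editor's note: a minimal host-side usage sketch for the wrapper above, assuming the gpu module
// headers of the same OpenCV revision are available; the file name "input.png" is illustrative only.
static void copyMakeBorderExample()
{
    cv::Mat h_src = cv::imread("input.png", cv::IMREAD_GRAYSCALE); // CV_8UC1, depth <= CV_32F as required
    cv::gpu::GpuMat d_src, d_dst;
    d_src.upload(h_src);
    // Pad 16 pixels on every side with a constant (black) border; for CV_8UC1 this takes the NPP fast path.
    cv::gpu::copyMakeBorder(d_src, d_dst, 16, 16, 16, 16, cv::BORDER_CONSTANT, cv::Scalar::all(0), cv::gpu::Stream::Null());
    cv::Mat h_dst;
    d_dst.download(h_dst);
}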
void cv::gpu::FarnebackOpticalFlow::operator ()( const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s) { CV_Assert(frame0.type() == CV_8U && frame1.type() == CV_8U); CV_Assert(frame0.size() == frame1.size()); CV_Assert(polyN == 5 || polyN == 7); CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6); Stream streams[5]; if (S(s)) streams[0] = s; Size size = frame0.size(); GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY; flowx.create(size, CV_32F); flowy.create(size, CV_32F); GpuMat flowx0 = flowx; GpuMat flowy0 = flowy; // Crop unnecessary levels double scale = 1; int numLevelsCropped = 0; for (; numLevelsCropped < numLevels; numLevelsCropped++) { scale *= pyrScale; if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE) break; } streams[0].enqueueConvert(frame0, frames_[0], CV_32F); streams[1].enqueueConvert(frame1, frames_[1], CV_32F); if (fastPyramids) { // Build Gaussian pyramids using pyrDown() pyramid0_.resize(numLevelsCropped + 1); pyramid1_.resize(numLevelsCropped + 1); pyramid0_[0] = frames_[0]; pyramid1_[0] = frames_[1]; for (int i = 1; i <= numLevelsCropped; ++i) { pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]); pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]); } } setPolynomialExpansionConsts(polyN, polySigma); device::optflow_farneback::setUpdateMatricesConsts(); for (int k = numLevelsCropped; k >= 0; k--) { streams[0].waitForCompletion(); scale = 1; for (int i = 0; i < k; i++) scale *= pyrScale; double sigma = (1./scale - 1) * 0.5; int smoothSize = cvRound(sigma*5) | 1; smoothSize = std::max(smoothSize, 3); int width = cvRound(size.width*scale); int height = cvRound(size.height*scale); if (fastPyramids) { width = pyramid0_[k].cols; height = pyramid0_[k].rows; } if (k > 0) { curFlowX.create(height, width, CV_32F); curFlowY.create(height, width, CV_32F); } else { curFlowX = flowx0; curFlowY = flowy0; } if (!prevFlowX.data) { if (flags & OPTFLOW_USE_INITIAL_FLOW) { #if ENABLE_GPU_RESIZE resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]); resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]); streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale); streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale); #else Mat tmp1, tmp2; flowx0.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA); tmp2 *= scale; curFlowX.upload(tmp2); flowy0.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA); tmp2 *= scale; curFlowY.upload(tmp2); #endif } else { streams[0].enqueueMemSet(curFlowX, 0); streams[1].enqueueMemSet(curFlowY, 0); } } else { #if ENABLE_GPU_RESIZE resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]); resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]); streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale); streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale); #else Mat tmp1, tmp2; prevFlowX.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR); tmp2 *= 1./pyrScale; curFlowX.upload(tmp2); prevFlowY.download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR); tmp2 *= 1./pyrScale; curFlowY.upload(tmp2); #endif } GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_); GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_); GpuMat R[2] = { allocMatFromBuf(5*height, width, CV_32F, R_[0]), allocMatFromBuf(5*height, width, CV_32F, R_[1]) }; if (fastPyramids) { 
device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0])); device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1])); } else { GpuMat blurredFrame[2] = { allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]), allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1]) }; GpuMat pyrLevel[2] = { allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]), allocMatFromBuf(height, width, CV_32F, pyrLevel_[1]) }; Mat g = getGaussianKernel(smoothSize, sigma, CV_32F); device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2); for (int i = 0; i < 2; i++) { device::optflow_farneback::gaussianBlurGpu( frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i])); #if ENABLE_GPU_RESIZE resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0, 0, INTER_LINEAR, streams[i]); #else Mat tmp1, tmp2; blurredFrame[i].download(tmp1); resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR); pyrLevel[i].upload(tmp2); #endif device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i])); } } streams[1].waitForCompletion(); device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0])); if (flags & OPTFLOW_FARNEBACK_GAUSSIAN) { Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F); device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2); } for (int i = 0; i < numIters; i++) { if (flags & OPTFLOW_FARNEBACK_GAUSSIAN) updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams); else updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams); } prevFlowX = curFlowX; prevFlowY = curFlowY; } flowx = curFlowX; flowy = curFlowY; if (!S(s)) streams[0].waitForCompletion(); }
void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream) { GpuMat src = _src.getGpuMat(); Mat M = _M.getMat(); CV_Assert( M.rows == 3 && M.cols == 3 ); const int interpolation = flags & INTER_MAX; CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP) ; _dst.create(dsize, src.type()); GpuMat dst = _dst.getGpuMat(); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); static const bool useNppTab[6][4][3] = { { {false, false, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} } }; bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation]; // NPP bug on float data useNpp = useNpp && src.depth() != CV_32F; if (useNpp) { typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream); static const func_t funcs[2][6][4] = { { {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} }, { {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} } }; dst.setTo(borderValue, stream); double coeffs[3][3]; Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream)); } else { using namespace cv::cuda::device::imgproc; typedef void (*func_t)(PtrStepSzb src, 
PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); static const func_t funcs[6][4] = { {warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> }, {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/}, {warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> }, {warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> }, {0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ }, {warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); float coeffs[3 * 3]; Mat coeffsMat(3, 3, CV_32F, (void*)coeffs); if (flags & WARP_INVERSE_MAP) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invert(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } Scalar_<float> borderValueFloat; borderValueFloat = borderValue; func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs, dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20)); } }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream) { if (query.empty() || train.empty()) return; using namespace ::cv::gpu::device::bf_knnmatch; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); static const caller_t callers[3][6] = { { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }, { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }, { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ } }; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); const int nQuery = query.rows; const int nTrain = train.rows; if (k == 2) { ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); } else { ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); ensureSizeIsEnough(nQuery, k, CV_32F, distance); ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); } if (stream) stream.enqueueMemSet(trainIdx, Scalar::all(-1)); else trainIdx.setTo(Scalar::all(-1)); caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); }
void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
{
    // dst must already be allocated with a matching size and type; otherwise the copy would have to
    // reallocate it here, and the destination would no longer point to page-locked memory.
    CV_Assert( src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() );
    devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost);
}
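// Editor's note: sketch of how the asynchronous download above is meant to be used, assuming the
// OpenCV 2.x gpu API (CudaMem, Stream); the destination is preallocated in page-locked host memory.
static void enqueueDownloadExample(const cv::gpu::GpuMat& d_src, cv::gpu::Stream& stream)
{
    // Allocate page-locked host memory matching the source, then wrap it in a Mat header.
    cv::gpu::CudaMem pinned(d_src.rows, d_src.cols, d_src.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
    cv::Mat h_dst = pinned.createMatHeader();
    stream.enqueueDownload(d_src, h_dst); // truly asynchronous only with a page-locked destination
    stream.waitForCompletion();           // h_dst is valid once the stream has finished
}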
template <typename T> static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4)); const Scalar zero = Scalar::all(0); cudaStream_t cudaStream = StreamAccessor::getStream(stream); //////////////////////////////////////////////////////////////////////////////////////////// // Init int rows = left.rows; int cols = left.cols; rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0))); int levels = rthis.levels; // compute sizes AutoBuffer<int> buf(levels * 3); int* cols_pyr = buf; int* rows_pyr = cols_pyr + levels; int* nr_plane_pyr = rows_pyr + levels; cols_pyr[0] = cols; rows_pyr[0] = rows; nr_plane_pyr[0] = rthis.nr_plane; for (int i = 1; i < levels; i++) { cols_pyr[i] = cols_pyr[i-1] / 2; rows_pyr[i] = rows_pyr[i-1] / 2; nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2; } GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected; //allocate buffers int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2 buffers_count += 2; // data_cost has twice as many rows as the other buffers, which is why this is +2, not +1 buffers_count += 1; // data_cost_selected mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type); data_cost = mbuf.rowRange(0, rows * rthis.nr_plane * 2); data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane); for(int k = 0; k < 2; ++k) // in/out { GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows); GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2); GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] }; for(int _r = 0; _r < 5; ++_r) { *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5); assert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane); } }; size_t elem_step = mbuf.step / sizeof(T); Size temp_size = data_cost.size(); if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp) temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp); temp.create(temp_size, DataType<T>::type); //////////////////////////////////////////////////////////////////////////// // Compute load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); if (stream) { stream.enqueueMemSet(l[0], zero); stream.enqueueMemSet(d[0], zero); stream.enqueueMemSet(r[0], zero); stream.enqueueMemSet(u[0], zero); stream.enqueueMemSet(l[1], zero); stream.enqueueMemSet(d[1], zero); stream.enqueueMemSet(r[1], zero); stream.enqueueMemSet(u[1], zero); stream.enqueueMemSet(data_cost, zero); stream.enqueueMemSet(data_cost_selected, zero); } else { l[0].setTo(zero); d[0].setTo(zero); r[0].setTo(zero); u[0].setTo(zero); l[1].setTo(zero); d[1].setTo(zero); r[1].setTo(zero); u[1].setTo(zero); data_cost.setTo(zero); data_cost_selected.setTo(zero); } int cur_idx = 0; for (int i = levels - 1; i >= 0; i--) { if (i == levels - 1) { init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(),
rthis.use_local_init_data_cost, cudaStream); } else { compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step, left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream); int new_idx = (cur_idx + 1) & 1; init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(), u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream); cur_idx = new_idx; } calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream); } if (disp.empty()) disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out)); if (stream) stream.enqueueMemSet(out, zero); else out.setTo(zero); compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream); if (disp.type() != CV_16S) { if (stream) stream.enqueueConvert(out, disp, disp.type()); else out.convertTo(disp, disp.type()); } }
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf) { using namespace ::cv::gpu::device::matrix_reductions::minmaxloc; typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); static Caller multipass_callers[] = { minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 }; static Caller singlepass_callers[] = { minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> }; static MaskedCaller masked_multipass_callers[] = { minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 }; static MaskedCaller masked_singlepass_callers[] = { minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> }; CV_Assert(src.depth() <= CV_64F); CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); if (src.depth() == CV_64F) { if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } double minVal_; if (!minVal) minVal = &minVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_; int minLoc_[2]; int maxLoc_[2]; Size valbuf_size, locbuf_size; getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height); ensureSizeIsEnough(valbuf_size, CV_8U, valBuf); ensureSizeIsEnough(locbuf_size, CV_8U, locBuf); if (mask.empty()) { Caller* callers = multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = singlepass_callers; Caller caller = callers[src.type()]; CV_Assert(caller != 0); caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } else { MaskedCaller* callers = masked_multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = masked_singlepass_callers; MaskedCaller caller = callers[src.type()]; CV_Assert(caller != 0); caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); } if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; } if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; } }
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) { class LevelsInit { public: Npp32s pLevels[256]; const Npp32s* pLevels3[3]; int nValues3[3]; #if (CUDA_VERSION > 4020) GpuMat d_pLevels; #endif LevelsInit() { nValues3[0] = nValues3[1] = nValues3[2] = 256; for (int i = 0; i < 256; ++i) pLevels[i] = i; #if (CUDA_VERSION <= 4020) pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; #else d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); #endif } }; static LevelsInit lvls; int cn = src.channels(); CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3); CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous()); dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn)); NppiSize sz; sz.height = src.rows; sz.width = src.cols; Mat nppLut; lut.convertTo(nppLut, CV_32S); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); if (src.type() == CV_8UC1) { #if (CUDA_VERSION <= 4020) nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) ); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) ); #endif } else { const Npp32s* pValues3[3]; Mat nppLut3[3]; if (nppLut.channels() == 1) { #if (CUDA_VERSION <= 4020) pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>(); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>(); #endif } else { cv::split(nppLut, nppLut3); #if (CUDA_VERSION <= 4020) pValues3[0] = nppLut3[0].ptr<Npp32s>(); pValues3[1] = nppLut3[1].ptr<Npp32s>(); pValues3[2] = nppLut3[2].ptr<Npp32s>(); #else GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data)); GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data)); GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data)); pValues3[0] = d_nppLut0.ptr<Npp32s>(); pValues3[1] = d_nppLut1.ptr<Npp32s>(); pValues3[2] = d_nppLut2.ptr<Npp32s>(); #endif } nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) ); } if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s) { CV_Assert(M.rows == 3 && M.cols == 3); int interpolation = flags & INTER_MAX; CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC); CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); static const bool useNppTab[6][4][3] = { { {false, false, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, false} }, { {false, false, false}, {false, false, false}, {false, false, false}, {false, false, false} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} }, { {false, true, true}, {false, false, false}, {false, true, true}, {false, false, true} } }; bool useNpp = borderMode == BORDER_CONSTANT; useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation]; #ifdef linux // NPP bug on float data useNpp = useNpp && src.depth() != CV_32F; #endif if (useNpp) { typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream); static const func_t funcs[2][6][4] = { { {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} }, { {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, {0, 0, 0, 0}, {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} } }; double coeffs[3][3]; Mat coeffsMat(3, 3, CV_64F, (void*)coeffs); M.convertTo(coeffsMat, coeffsMat.type()); const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s)); } else { using namespace cv::gpu::device::imgproc; typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int 
borderMode, const float* borderValue, cudaStream_t stream, int cc); static const func_t funcs[6][4] = { {warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> }, {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/}, {warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> }, {warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> }, {0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ }, {warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType)); dst.create(dsize, src.type()); float coeffs[3 * 3]; Mat coeffsMat(3, 3, CV_32F, (void*)coeffs); if (flags & WARP_INVERSE_MAP) M.convertTo(coeffsMat, coeffsMat.type()); else { cv::Mat iM; invert(M, iM); iM.convertTo(coeffsMat, coeffsMat.type()); } Scalar_<float> borderValueFloat; borderValueFloat = borderValue; DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc); } }
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s) { CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA); CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0)); if (dsize == Size()) dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy)); else { fx = static_cast<double>(dsize.width) / src.cols; fy = static_cast<double>(dsize.height) / src.rows; } if (dsize != dst.size()) dst.create(dsize, src.type()); if (dsize == src.size()) { if (s) s.enqueueCopy(src, dst); else src.copyTo(dst); return; } cudaStream_t stream = StreamAccessor::getStream(s); Size wholeSize; Point ofs; src.locateROI(wholeSize, ofs); bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4); useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (src.type() == CV_8UC4 && interpolation != INTER_AREA)); if (useNpp) { typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize, double xFactor, double yFactor, int eInterpolation); const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R }; static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS}; NppiSize srcsz; srcsz.width = wholeSize.width; srcsz.height = wholeSize.height; NppiRect srcrect; srcrect.x = ofs.x; srcrect.y = ofs.y; srcrect.width = src.cols; srcrect.height = src.rows; NppiSize dstsz; dstsz.width = dst.cols; dstsz.height = dst.rows; NppStreamHandler h(stream); nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect, dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } else { using namespace ::cv::gpu::device::imgproc; typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { {resize_gpu<uchar> , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3> , resize_gpu<uchar4> }, {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/ , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/}, {resize_gpu<ushort> , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3> , resize_gpu<ushort4> }, {resize_gpu<short> , 0 /*resize_gpu<short2>*/ , resize_gpu<short3> , resize_gpu<short4> }, {0 /*resize_gpu<int>*/ , 0 /*resize_gpu<int2>*/ , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ }, {resize_gpu<float> , 0 /*resize_gpu<float2>*/ , resize_gpu<float3> , resize_gpu<float4> } }; const func_t func = funcs[src.depth()][src.channels() - 1]; CV_Assert(func != 0); func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream); } }
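// Editor's note: minimal usage sketch for the resize wrapper above (OpenCV 2.x gpu API assumed).
static void resizeExample(const cv::gpu::GpuMat& d_src)
{
    cv::gpu::GpuMat d_half, d_fixed;
    // Scale factors only: dsize is derived from fx/fy inside the wrapper.
    cv::gpu::resize(d_src, d_half, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR, cv::gpu::Stream::Null());
    // Explicit destination size: fx/fy are recomputed from dsize inside the wrapper.
    cv::gpu::resize(d_src, d_fixed, cv::Size(640, 480), 0, 0, cv::INTER_NEAREST, cv::gpu::Stream::Null());
}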
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream) { #ifndef HAVE_CUFFT (void) _src; (void) _dst; (void) dft_size; (void) flags; (void) stream; throw_no_cuda(); #else GpuMat src = _src.getGpuMat(); CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 ); // We don't support unpacked output (in the case of real input) CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) ); const bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1); const bool is_row_dft = (flags & DFT_ROWS) != 0; const bool is_scaled_dft = (flags & DFT_SCALE) != 0; const bool is_inverse = (flags & DFT_INVERSE) != 0; const bool is_complex_input = src.channels() == 2; const bool is_complex_output = !(flags & DFT_REAL_OUTPUT); // We don't support real-to-real transform CV_Assert( is_complex_input || is_complex_output ); GpuMat src_cont = src; // Make sure here we work with the continuous input, // as CUFFT can't handle gaps createContinuous(src.rows, src.cols, src.type(), src_cont); if (src_cont.data != src.data) src.copyTo(src_cont, stream); Size dft_size_opt = dft_size; if (is_1d_input && !is_row_dft) { // If the source matrix is single column handle it as single row dft_size_opt.width = std::max(dft_size.width, dft_size.height); dft_size_opt.height = std::min(dft_size.width, dft_size.height); } CV_Assert( dft_size_opt.width > 1 ); cufftType dft_type = CUFFT_R2C; if (is_complex_input) dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R; cufftHandle plan; if (is_1d_input || is_row_dft) cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) ); else cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) ); cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); if (is_complex_input) { if (is_complex_output) { createContinuous(dft_size, CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2C( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(), is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); } else { createContinuous(dft_size, CV_32F, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecC2R( plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>())); } } else { // We could swap dft_size for efficiency. Here we must reflect it if (dft_size == dft_size_opt) createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst); else createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst); GpuMat dst = _dst.getGpuMat(); cufftSafeCall(cufftExecR2C( plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>())); } cufftSafeCall( cufftDestroy(plan) ); if (is_scaled_dft) cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream); #endif }
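// Editor's note: sketch of a forward/inverse round trip through the dft wrapper above, assuming the
// module was built with CUFFT. A real CV_32FC1 input produces a packed CV_32FC2 spectrum of width
// cols/2 + 1, as allocated in the R2C branch above.
static void dftExample(const cv::cuda::GpuMat& d_real32f) // CV_32FC1 input
{
    cv::cuda::GpuMat d_spectrum, d_back;
    cv::cuda::dft(d_real32f, d_spectrum, d_real32f.size(), 0); // R2C forward transform
    cv::cuda::dft(d_spectrum, d_back, d_real32f.size(),
                  cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE); // C2R inverse, scaled by 1/N
}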
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf) { using namespace mathfunc::minmax; typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); static Caller multipass_callers[7] = { minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 }; static Caller singlepass_callers[7] = { minMaxCaller<unsigned char>, minMaxCaller<char>, minMaxCaller<unsigned short>, minMaxCaller<short>, minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> }; static MaskedCaller masked_multipass_callers[7] = { minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>, minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 }; static MaskedCaller masked_singlepass_callers[7] = { minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> }; CV_Assert(src.channels() == 1); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); double minVal_; if (!minVal) minVal = &minVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_; Size buf_size; getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height); ensureSizeIsEnough(buf_size, CV_8U, buf); if (mask.empty()) { Caller* callers = multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = singlepass_callers; Caller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); caller(src, minVal, maxVal, buf); } else { MaskedCaller* callers = masked_multipass_callers; if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) callers = masked_singlepass_callers; MaskedCaller caller = callers[src.type()]; if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); caller(src, mask, minVal, maxVal, buf); } }
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
    CV_Assert(disp.rows == img.rows && disp.cols == img.cols
              && (disp.type() == CV_8U || disp.type() == CV_16S)
              && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold,
                           table_color, table_space, disp, img, dst, stream);
}
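// Editor's note: usage sketch for the operator above, assuming the OpenCV 2.x gpu API where the
// filter object is constructed as DisparityBilateralFilter(ndisp, radius, iters).
static void disparityBilateralExample(const cv::gpu::GpuMat& d_disp /*CV_8U or CV_16S*/,
                                      const cv::gpu::GpuMat& d_img  /*CV_8UC1 or CV_8UC3*/)
{
    cv::gpu::DisparityBilateralFilter dbf(64 /*ndisp*/, 3 /*radius*/, 1 /*iters*/);
    cv::gpu::GpuMat d_refined;
    dbf(d_disp, d_img, d_refined, cv::gpu::Stream::Null()); // refine disparity edges guided by the color image
}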
Mat visionUtils::skinDetect(Mat captureframe, Mat3b *skinDetectHSV, Mat *skinMask, std::vector<int> adaptiveHSV, int minPixelSize, int imgBlurPixels, int imgMorphPixels, int singleRegionChoice, bool displayFaces)
{
    if (adaptiveHSV.size() != 6)
    {
        adaptiveHSV.clear();
        adaptiveHSV.push_back(5);
        adaptiveHSV.push_back(38);
        adaptiveHSV.push_back(51);
        adaptiveHSV.push_back(17);
        adaptiveHSV.push_back(250);
        adaptiveHSV.push_back(242);
    }

    //int step = 0;
    Mat3b frameTemp;
    Mat3b frame;
    // Forcing resize to 640x480 -> all thresholds / pixel filters configured for this size.....
    // Note: returned to original size at end...
    Size s = captureframe.size();
    cv::resize(captureframe, captureframe, Size(640, 480));

    if (useGPU)
    {
        GpuMat imgGPU, imgGPUHSV;
        imgGPU.upload(captureframe);
        cv::cvtColor(imgGPU, imgGPUHSV, CV_BGR2HSV);
        GaussianBlur(imgGPUHSV, imgGPUHSV, Size(imgBlurPixels, imgBlurPixels), 1, 1);
        imgGPUHSV.download(frameTemp);
    }
    else
    {
        cv::cvtColor(captureframe, frameTemp, CV_BGR2HSV);
        GaussianBlur(frameTemp, frameTemp, Size(imgBlurPixels, imgBlurPixels), 1, 1);
    }

    // Potential FASTER VERSION using inRange
    Mat frameThreshold = Mat::zeros(frameTemp.rows, frameTemp.cols, CV_8UC1);
    Mat hsvMin = (Mat_<int>(1,3) << adaptiveHSV[0], adaptiveHSV[1], adaptiveHSV[2]);
    Mat hsvMax = (Mat_<int>(1,3) << adaptiveHSV[3], adaptiveHSV[4], adaptiveHSV[5]);
    inRange(frameTemp, hsvMin, hsvMax, frameThreshold);
    frameTemp.copyTo(frame, frameThreshold);

    /* BGR CONVERSION AND THRESHOLD */
    Mat1b frame_gray;

    // send HSV to skinDetectHSV for return
    *skinDetectHSV = frame.clone();

    cv::cvtColor(frame, frame_gray, CV_BGR2GRAY);

    // Adaptive thresholding technique
    // 1. Threshold data to find main areas of skin
    adaptiveThreshold(frame_gray, frame_gray, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY_INV, 9, 1);

    if (useGPU)
    {
        GpuMat imgGPU;
        imgGPU.upload(frame_gray);
        // 2. Fill in thresholded areas
#if CV_MAJOR_VERSION == 2
        gpu::morphologyEx(imgGPU, imgGPU, CV_MOP_CLOSE, Mat1b(imgMorphPixels, imgMorphPixels, 1), Point(-1, -1), 2);
        gpu::GaussianBlur(imgGPU, imgGPU, Size(imgBlurPixels, imgBlurPixels), 1, 1);
#elif CV_MAJOR_VERSION == 3
        //TODO: Check if that's correct
        Mat element = getStructuringElement(MORPH_RECT, Size(imgMorphPixels, imgMorphPixels), Point(-1, -1));
        Ptr<cuda::Filter> closeFilter = cuda::createMorphologyFilter(MORPH_CLOSE, imgGPU.type(), element, Point(-1, -1), 2);
        closeFilter->apply(imgGPU, imgGPU);
        cv::Ptr<cv::cuda::Filter> gaussianFilter = cv::cuda::createGaussianFilter(imgGPU.type(), imgGPU.type(), Size(imgMorphPixels, imgMorphPixels), 1, 1);
        gaussianFilter->apply(imgGPU, imgGPU);
#endif
        imgGPU.download(frame_gray);
    }
    else
    {
        // 2. Fill in thresholded areas
        morphologyEx(frame_gray, frame_gray, CV_MOP_CLOSE, Mat1b(imgMorphPixels, imgMorphPixels, 1), Point(-1, -1), 2);
        GaussianBlur(frame_gray, frame_gray, Size(imgBlurPixels, imgBlurPixels), 1, 1);
    }

    // Select single largest region from image, if singleRegionChoice is selected (1)
    if (singleRegionChoice)
    {
        *skinMask = cannySegmentation(frame_gray, -1, displayFaces);
    }
    else // Detect each separate block and remove blobs smaller than a few pixels
    {
        *skinMask = cannySegmentation(frame_gray, minPixelSize, displayFaces);
    }

    // Just return skin
    Mat frame_skin;
    captureframe.copyTo(frame_skin, *skinMask); // Copy captureframe data to frame_skin, using the skin mask

    // Resize image to original before return
    cv::resize(frame_skin, frame_skin, s);

    if (displayFaces)
    {
        imshow("Skin HSV (B)", frame);
        imshow("Adaptive_threshold (D1)", frame_gray);
        imshow("Skin segmented", frame_skin);
        waitKey(1); // let HighGUI refresh the debug windows (previously unreachable code after the return)
    }

    return frame_skin;
}
void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream) { if (query.empty() || train.empty()) return; using namespace cv::gpu::device::bf_knnmatch; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); static const caller_t callersL1[] = { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }; static const caller_t callersL2[] = { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }; static const caller_t callersHamming[] = { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ }; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming; const int nQuery = query.rows; const int nTrain = train.rows; if (k == 2) { ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); } else { ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); ensureSizeIsEnough(nQuery, k, CV_32F, distance); ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); } if (stream) stream.enqueueMemSet(trainIdx, Scalar::all(-1)); else trainIdx.setTo(Scalar::all(-1)); caller_t func = callers[query.depth()]; CV_Assert(func != 0); func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); }
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask) { using namespace cv::gpu::device::gfft; CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0); CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size())); if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); ensureSizeIsEnough(image.size(), CV_32F, eig_); if (useHarrisDetector) cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK); else cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3); double maxVal = 0; minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_); ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols); if (total == 0) { corners.release(); return; } sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total); if (minDistance < 1) tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners); else { vector<Point2f> tmp(total); Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]); tmpCorners_.colRange(0, total).download(tmpMat); vector<Point2f> tmp2; tmp2.reserve(total); const int cell_size = cvRound(minDistance); const int grid_width = (image.cols + cell_size - 1) / cell_size; const int grid_height = (image.rows + cell_size - 1) / cell_size; std::vector< std::vector<Point2f> > grid(grid_width * grid_height); for (int i = 0; i < total; ++i) { Point2f p = tmp[i]; bool good = true; int x_cell = static_cast<int>(p.x / cell_size); int y_cell = static_cast<int>(p.y / cell_size); int x1 = x_cell - 1; int y1 = y_cell - 1; int x2 = x_cell + 1; int y2 = y_cell + 1; // boundary check x1 = std::max(0, x1); y1 = std::max(0, y1); x2 = std::min(grid_width - 1, x2); y2 = std::min(grid_height - 1, y2); for (int yy = y1; yy <= y2; yy++) { for (int xx = x1; xx <= x2; xx++) { vector<Point2f>& m = grid[yy * grid_width + xx]; if (!m.empty()) { for(size_t j = 0; j < m.size(); j++) { float dx = p.x - m[j].x; float dy = p.y - m[j].y; if (dx * dx + dy * dy < minDistance * minDistance) { good = false; goto break_out; } } } } } break_out: if(good) { grid[y_cell * grid_width + x_cell].push_back(p); tmp2.push_back(p); if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners)) break; } } corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0])); } }
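// Editor's note: usage sketch for the detector above (OpenCV 2.x gpu API assumed); the corner count
// and quality settings are illustrative.
static void goodFeaturesExample(const cv::gpu::GpuMat& d_gray /*CV_8UC1*/)
{
    cv::gpu::GoodFeaturesToTrackDetector_GPU detector(500 /*maxCorners*/, 0.01 /*qualityLevel*/, 10.0 /*minDistance*/);
    cv::gpu::GpuMat d_corners; // 1 x N, CV_32FC2
    detector(d_gray, d_corners, cv::gpu::GpuMat());
    std::vector<cv::Point2f> corners;
    if (!d_corners.empty())
    {
        corners.resize(d_corners.cols);
        cv::Mat h_corners(1, d_corners.cols, CV_32FC2, &corners[0]);
        d_corners.download(h_corners);
    }
}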
double cv::gpu::norm(const GpuMat& src1, int normType) { return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType); }
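// Editor's note: the overload above forwards to the two-argument difference norm with an all-zero
// matrix of the same size and type, so norm(src, NORM_L2) is simply the L2 norm of src. A minimal
// sketch (OpenCV 2.x gpu API assumed):
static double l2NormExample(const cv::Mat& h_src)
{
    cv::gpu::GpuMat d_src;
    d_src.upload(h_src);
    return cv::gpu::norm(d_src, cv::NORM_L2);
}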
void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy) { CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 ); CV_Assert( I0.size() == I1.size() ); CV_Assert( I0.type() == I1.type() ); CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) ); CV_Assert( nscales > 0 ); // allocate memory for the pyramid structure I0s.resize(nscales); I1s.resize(nscales); u1s.resize(nscales); u2s.resize(nscales); u3s.resize(nscales); I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0); I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0); if (!useInitialFlow) { flowx.create(I0.size(), CV_32FC1); flowy.create(I0.size(), CV_32FC1); } u1s[0] = flowx; u2s[0] = flowy; if (gamma) u3s[0].create(I0.size(), CV_32FC1); I1x_buf.create(I0.size(), CV_32FC1); I1y_buf.create(I0.size(), CV_32FC1); I1w_buf.create(I0.size(), CV_32FC1); I1wx_buf.create(I0.size(), CV_32FC1); I1wy_buf.create(I0.size(), CV_32FC1); grad_buf.create(I0.size(), CV_32FC1); rho_c_buf.create(I0.size(), CV_32FC1); p11_buf.create(I0.size(), CV_32FC1); p12_buf.create(I0.size(), CV_32FC1); p21_buf.create(I0.size(), CV_32FC1); p22_buf.create(I0.size(), CV_32FC1); if (gamma) { p31_buf.create(I0.size(), CV_32FC1); p32_buf.create(I0.size(), CV_32FC1); } diff_buf.create(I0.size(), CV_32FC1); // create the scales for (int s = 1; s < nscales; ++s) { cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep); cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep); if (I0s[s].cols < 16 || I0s[s].rows < 16) { nscales = s; break; } if (useInitialFlow) { cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep); cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep); cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]); cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]); } else { u1s[s].create(I0s[s].size(), CV_32FC1); u2s[s].create(I0s[s].size(), CV_32FC1); } if (gamma) u3s[s].create(I0s[s].size(), CV_32FC1); } if (!useInitialFlow) { u1s[nscales-1].setTo(Scalar::all(0)); u2s[nscales-1].setTo(Scalar::all(0)); } if (gamma) u3s[nscales - 1].setTo(Scalar::all(0)); // pyramidal structure for computing the optical flow for (int s = nscales - 1; s >= 0; --s) { // compute the optical flow at the current scale procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s]); // if this was the last scale, finish now if (s == 0) break; // otherwise, upsample the optical flow // zoom the optical flow for the next finer scale cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size()); cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size()); if (gamma) cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size()); // scale the optical flow with the appropriate zoom factor cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]); cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]); } }
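// Editor's note: usage sketch for the operator above, assuming OpticalFlowDual_TVL1_CUDA provides a
// default constructor with reasonable parameters; frames are CV_8UC1 or CV_32FC1 of equal size.
static void tvl1Example(const cv::cuda::GpuMat& d_frame0, const cv::cuda::GpuMat& d_frame1)
{
    cv::cuda::OpticalFlowDual_TVL1_CUDA tvl1;
    cv::cuda::GpuMat d_flowx, d_flowy; // allocated inside when useInitialFlow is false
    tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
}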
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); #else // CUBLAS works with column-major matrices CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type())); if (src1.depth() == CV_64F) { if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } bool tr1 = (flags & GEMM_1_T) != 0; bool tr2 = (flags & GEMM_2_T) != 0; bool tr3 = (flags & GEMM_3_T) != 0; if (src1.type() == CV_64FC2) { if (tr1 || tr2 || tr3) CV_Error(CV_StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type"); } Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size(); Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size(); Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size(); Size dstSize(src2Size.width, src1Size.height); CV_Assert(src1Size.width == src2Size.height); CV_Assert(src3.empty() || src3Size == dstSize); dst.create(dstSize, src1.type()); if (beta != 0) { if (src3.empty()) { if (stream) stream.enqueueMemSet(dst, Scalar::all(0)); else dst.setTo(Scalar::all(0)); } else { if (tr3) { transpose(src3, dst, stream); } else { if (stream) stream.enqueueCopy(src3, dst); else src3.copyTo(dst); } } } cublasHandle_t handle; cublasSafeCall( cublasCreate_v2(&handle) ); cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) ); cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) ); const float alphaf = static_cast<float>(alpha); const float betaf = static_cast<float>(beta); const cuComplex alphacf = make_cuComplex(alphaf, 0); const cuComplex betacf = make_cuComplex(betaf, 0); const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0); const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0); cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N; switch (src1.type()) { case CV_32FC1: cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphaf, src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)), src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)), &betaf, dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) ); break; case CV_64FC1: cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alpha, src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)), src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)), &beta, dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) ); break; case CV_32FC2: cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphacf, src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)), src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)), &betacf, dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) ); break; case CV_64FC2: cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? 
src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, &alphac, src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)), src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)), &betac, dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) ); break; } cublasSafeCall( cublasDestroy_v2(handle) ); #endif }
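// Editor's note: usage sketch for the gemm wrapper above, assuming the library was built with CUBLAS;
// computes C = A * B for single-precision inputs (no third operand, beta = 0).
static void gemmExample(const cv::gpu::GpuMat& d_A /*CV_32FC1*/, const cv::gpu::GpuMat& d_B /*CV_32FC1*/)
{
    cv::gpu::GpuMat d_C;
    cv::gpu::gemm(d_A, d_B, 1.0, cv::gpu::GpuMat(), 0.0, d_C, 0, cv::gpu::Stream::Null());
}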
void cv::cuda::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st) { CV_Assert( prev.type() == CV_8UC1 ); CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() ); const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width, (prev.rows - blockSize.height + shiftSize.height) / shiftSize.height); velx.create(velSize, CV_32FC1); vely.create(velSize, CV_32FC1); // scanning scheme coordinates std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1)); int ssCount = 0; // Calculate scanning scheme const int minCount = std::min(maxRange.width, maxRange.height); // use spiral search pattern // // 9 10 11 12 // 8 1 2 13 // 7 * 3 14 // 6 5 4 15 //... 20 19 18 17 // for (int i = 0; i < minCount; ++i) { // four cycles along sides int x = -i - 1, y = x; // upper side for (int j = -i; j <= i + 1; ++j, ++ssCount) { ss[ssCount].x = (short) ++x; ss[ssCount].y = (short) y; } // right side for (int j = -i; j <= i + 1; ++j, ++ssCount) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) ++y; } // bottom side for (int j = -i; j <= i + 1; ++j, ++ssCount) { ss[ssCount].x = (short) --x; ss[ssCount].y = (short) y; } // left side for (int j = -i; j <= i + 1; ++j, ++ssCount) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) --y; } } // the rest part if (maxRange.width < maxRange.height) { const int xleft = -minCount; // cycle by neighbor rings for (int i = minCount; i < maxRange.height; ++i) { // two cycles by x int y = -(i + 1); int x = xleft; // upper side for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) y; } x = xleft; y = -y; // bottom side for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) y; } } } else if (maxRange.width > maxRange.height) { const int yupper = -minCount; // cycle by neighbor rings for (int i = minCount; i < maxRange.width; ++i) { // two cycles by y int x = -(i + 1); int y = yupper; // left side for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) y; } y = yupper; x = -x; // right side for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y) { ss[ssCount].x = (short) x; ss[ssCount].y = (short) y; } } } const cudaStream_t stream = StreamAccessor::getStream(st); ensureSizeIsEnough(1, ssCount, CV_16SC2, buf); if (stream == 0) cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) ); else cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) ); const int maxX = prev.cols - blockSize.width; const int maxY = prev.rows - blockSize.height; const int SMALL_DIFF = 2; const int BIG_DIFF = 128; const int blSize = blockSize.area(); const int acceptLevel = blSize * SMALL_DIFF; const int escapeLevel = blSize * BIG_DIFF; optflowbm::calc(prev, curr, velx, vely, make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious, maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream); }
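// Editor's note: usage sketch for the block-matching flow above; the block/shift/range sizes are
// illustrative and both frames must be CV_8UC1 of equal size.
static void blockMatchingFlowExample(const cv::cuda::GpuMat& d_prev, const cv::cuda::GpuMat& d_curr)
{
    cv::cuda::GpuMat d_velx, d_vely, d_buf;
    cv::cuda::calcOpticalFlowBM(d_prev, d_curr,
                                cv::Size(16, 16) /*blockSize*/,
                                cv::Size(1, 1)   /*shiftSize*/,
                                cv::Size(16, 16) /*maxRange*/,
                                false /*usePrevious*/,
                                d_velx, d_vely, d_buf,
                                cv::cuda::Stream::Null());
}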
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err) { using namespace cv::gpu::device::pyrlk; if (prevPts.empty()) { nextPts.release(); status.release(); if (err) err->release(); return; } dim3 block, patch; calcPatchSize(winSize, block, patch, isDeviceArch11_); CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4); CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); CV_Assert(maxLevel >= 0); CV_Assert(winSize.width > 2 && winSize.height > 2); CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2); if (useInitialFlow) CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2); else ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); GpuMat temp2 = nextPts.reshape(1); multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2); ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); status.setTo(Scalar::all(1)); if (err) ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); // build the image pyramids. prevPyr_.resize(maxLevel + 1); nextPyr_.resize(maxLevel + 1); int cn = prevImg.channels(); if (cn == 1 || cn == 4) { prevImg.convertTo(prevPyr_[0], CV_32F); nextImg.convertTo(nextPyr_[0], CV_32F); } else { cvtColor(prevImg, dx_calcBuf_, COLOR_BGR2BGRA); dx_calcBuf_.convertTo(prevPyr_[0], CV_32F); cvtColor(nextImg, dx_calcBuf_, COLOR_BGR2BGRA); dx_calcBuf_.convertTo(nextPyr_[0], CV_32F); } for (int level = 1; level <= maxLevel; ++level) { pyrDown(prevPyr_[level - 1], prevPyr_[level]); pyrDown(nextPyr_[level - 1], nextPyr_[level]); } loadConstants(make_int2(winSize.width, winSize.height), iters); for (int level = maxLevel; level >= 0; level--) { if (cn == 1) { lkSparse1_gpu(prevPyr_[level], nextPyr_[level], prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols, level, block, patch); } else { lkSparse4_gpu(prevPyr_[level], nextPyr_[level], prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols, level, block, patch); } } }
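A minimal sparse-tracking sketch (illustrative, not from the source); prevFrame/nextFrame are hypothetical host images and hostPts a hypothetical std::vector<cv::Point2f> of seed points, e.g. from goodFeaturesToTrack:

cv::gpu::PyrLKOpticalFlow pyrlk;
pyrlk.winSize  = cv::Size(21, 21);     // must be larger than 2 in both dimensions (see the asserts above)
pyrlk.maxLevel = 3;
pyrlk.iters    = 30;

cv::gpu::GpuMat d_prev(prevFrame), d_next(nextFrame);   // CV_8UC1, CV_8UC3 or CV_8UC4, equal sizes
cv::gpu::GpuMat d_prevPts;
d_prevPts.upload(cv::Mat(1, (int)hostPts.size(), CV_32FC2, (void*)&hostPts[0]));  // 1 x N, CV_32FC2

cv::gpu::GpuMat d_nextPts, d_status, d_err;
pyrlk.sparse(d_prev, d_next, d_prevPts, d_nextPts, d_status, &d_err);
// d_nextPts is 1 x N CV_32FC2, d_status is 1 x N CV_8UC1 (1 = tracked), d_err is optional CV_32FC1.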
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s) { #if (CUDA_VERSION < 5000) CV_Assert(terminals.type() == CV_32S); #else CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F); #endif Size src_size = terminals.size(); CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width)); CV_Assert(leftTransp.type() == terminals.type()); CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width)); CV_Assert(rightTransp.type() == terminals.type()); CV_Assert(top.size() == src_size); CV_Assert(top.type() == terminals.type()); CV_Assert(topLeft.size() == src_size); CV_Assert(topLeft.type() == terminals.type()); CV_Assert(topRight.size() == src_size); CV_Assert(topRight.type() == terminals.type()); CV_Assert(bottom.size() == src_size); CV_Assert(bottom.type() == terminals.type()); CV_Assert(bottomLeft.size() == src_size); CV_Assert(bottomLeft.type() == terminals.type()); CV_Assert(bottomRight.size() == src_size); CV_Assert(bottomRight.type() == terminals.type()); labels.create(src_size, CV_8U); NppiSize sznpp; sznpp.width = src_size.width; sznpp.height = src_size.height; int bufsz; nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) ); ensureSizeIsEnough(1, bufsz, CV_8U, buf); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc); #if (CUDA_VERSION < 5000) nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(), bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); #else if (terminals.type() == CV_32S) { nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(), bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); } else { nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(), bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); } #endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
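A hedged calling sketch for the 8-neighborhood wrapper above (illustrative only; the capacity semantics are those of the underlying nppiGraphcut8* routines). All edge matrices share the terminals type, and the left/right n-links are supplied transposed, i.e. with width and height swapped, as the asserts require:

const int W = 320, H = 240;
cv::gpu::GpuMat terminals(H, W, CV_32S);                              // t-link capacities per pixel
cv::gpu::GpuMat top(H, W, CV_32S), topLeft(H, W, CV_32S), topRight(H, W, CV_32S);
cv::gpu::GpuMat bottom(H, W, CV_32S), bottomLeft(H, W, CV_32S), bottomRight(H, W, CV_32S);
cv::gpu::GpuMat leftTransp(W, H, CV_32S), rightTransp(W, H, CV_32S);  // note the transposed size
// ... upload capacities computed on the host, or fill them with a kernel ...
cv::gpu::GpuMat labels, buf;
cv::gpu::graphcut(terminals, leftTransp, rightTransp, top, topLeft, topRight,
                  bottom, bottomLeft, bottomRight, labels, buf, cv::gpu::Stream::Null());
// labels comes back as a CV_8U segmentation mask of size H x W.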
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask, Stream& stream) { if (query.empty() || train.empty()) return; using namespace cv::gpu::device::bf_radius_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream); static const caller_t callers[3][6] = { { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> }, { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> }, { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ } }; DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); const int nQuery = query.rows; const int nTrain = train.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx); ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance); } if (stream) stream.enqueueMemSet(nMatches, Scalar::all(0)); else nMatches.setTo(Scalar::all(0)); caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); }
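A hedged usage sketch assuming the 2.4-era matcher API (not part of the source); queryDescriptors/trainDescriptors are hypothetical CV_32F host matrices with one descriptor per row, and the header is assumed to default the mask and stream arguments. With L2Dist only float descriptors have a GPU kernel in the caller table above; the device-side results are typically turned into DMatch lists with radiusMatchDownload:

cv::gpu::BruteForceMatcher_GPU_base matcher(cv::gpu::BruteForceMatcher_GPU_base::L2Dist);
cv::gpu::GpuMat d_query(queryDescriptors), d_train(trainDescriptors);   // CV_32F, one descriptor per row
cv::gpu::GpuMat trainIdx, distance, nMatches;
matcher.radiusMatchSingle(d_query, d_train, trainIdx, distance, nMatches,
                          0.2f /* maxDistance in the selected metric */);
std::vector<std::vector<cv::DMatch> > matches;
cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(trainIdx, distance, nMatches, matches);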
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria) { CV_Assert(TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12)); CV_Assert(src.type() == CV_8UC4); const int nrows = src.rows; const int ncols = src.cols; const int hr = sr; const int hsp = sp; // Perform the mean shift procedure and obtain region and spatial maps GpuMat h_rmap, h_spmap; meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria); Mat rmap = h_rmap; Mat spmap = h_spmap; Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1) + (nrows - 1) + (ncols - 1)); // Build a region adjacency graph from the image Vec4b r1; Vec4b r2[4]; Vec2s sp1; Vec2s sp2[4]; int dr[4]; int dsp[4]; for (int y = 0; y < nrows - 1; ++y) { Vec4b* ry = rmap.ptr<Vec4b>(y); Vec4b* ryp = rmap.ptr<Vec4b>(y + 1); Vec2s* spy = spmap.ptr<Vec2s>(y); Vec2s* spyp = spmap.ptr<Vec2s>(y + 1); for (int x = 0; x < ncols - 1; ++x) { r1 = ry[x]; sp1 = spy[x]; r2[0] = ry[x + 1]; r2[1] = ryp[x]; r2[2] = ryp[x + 1]; r2[3] = ryp[x]; sp2[0] = spy[x + 1]; sp2[1] = spyp[x]; sp2[2] = spyp[x + 1]; sp2[3] = spyp[x]; dr[0] = dist2(r1, r2[0]); dr[1] = dist2(r1, r2[1]); dr[2] = dist2(r1, r2[2]); dsp[0] = dist2(sp1, sp2[0]); dsp[1] = dist2(sp1, sp2[1]); dsp[2] = dist2(sp1, sp2[2]); r1 = ry[x + 1]; sp1 = spy[x + 1]; dr[3] = dist2(r1, r2[3]); dsp[3] = dist2(sp1, sp2[3]); g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0])); g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1])); g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2])); g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3])); } } for (int y = 0; y < nrows - 1; ++y) { r1 = rmap.at<Vec4b>(y, ncols - 1); r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1); sp1 = spmap.at<Vec2s>(y, ncols - 1); sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1); dr[0] = dist2(r1, r2[0]); dsp[0] = dist2(sp1, sp2[0]); g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0])); } for (int x = 0; x < ncols - 1; ++x) { r1 = rmap.at<Vec4b>(nrows - 1, x); r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1); sp1 = spmap.at<Vec2s>(nrows - 1, x); sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1); dr[0] = dist2(r1, r2[0]); dsp[0] = dist2(sp1, sp2[0]); g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0])); } DjSets comps(g.numv); // Find adjacent components for (int v = 0; v < g.numv; ++v) { for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next) { int c1 = comps.find(v); int c2 = comps.find(g.edges[e_it].to); if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp) comps.merge(c1, c2); } } vector<SegmLink> edges; edges.reserve(g.numv); // Prepare edges connecting different components for (int v = 0; v < g.numv; ++v) { int c1 = comps.find(v); for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next) { int c2 = comps.find(g.edges[e_it].to); if (c1 != c2) edges.push_back(SegmLink(c1, c2, g.edges[e_it].val)); } } // Sort all graph edges connecting different components (in ascending order) sort(edges.begin(), edges.end()); // Exclude small components (starting from the closest pair) for (size_t i = 0; i < edges.size(); ++i) { int c1 = comps.find(edges[i].from); int c2 = comps.find(edges[i].to); if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize)) comps.merge(c1, c2); } // Compute the sum of pixel colors within each segment Mat h_src = src; vector<Vec4i>
sumcols(nrows * ncols, Vec4i(0, 0, 0, 0)); for (int y = 0; y < nrows; ++y) { Vec4b* h_srcy = h_src.ptr<Vec4b>(y); for (int x = 0; x < ncols; ++x) { int parent = comps.find(pix(y, x, ncols)); Vec4b col = h_srcy[x]; Vec4i& sumcol = sumcols[parent]; sumcol[0] += col[0]; sumcol[1] += col[1]; sumcol[2] += col[2]; } } // Create the final image; the color of each segment is the average color of its pixels dst.create(src.size(), src.type()); for (int y = 0; y < nrows; ++y) { Vec4b* dsty = dst.ptr<Vec4b>(y); for (int x = 0; x < ncols; ++x) { int parent = comps.find(pix(y, x, ncols)); const Vec4i& sumcol = sumcols[parent]; Vec4b& dstcol = dsty[x]; dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]); dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]); dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]); } } }
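A short usage sketch (not from the source). The asserts above require compute capability 1.2 and a CV_8UC4 input, so a hypothetical CV_8UC3 GpuMat d_bgr is expanded to BGRA on the device first:

cv::gpu::GpuMat d_bgra;
cv::gpu::cvtColor(d_bgr, d_bgra, cv::COLOR_BGR2BGRA);       // meanShiftSegmentation wants CV_8UC4
cv::Mat segmented;                                          // the result is produced on the host
cv::gpu::meanShiftSegmentation(d_bgra, segmented,
                               20 /* sp: spatial window radius */,
                               20 /* sr: color window radius */,
                               50 /* minsize: merge segments smaller than this */,
                               cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1));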
void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2) { using namespace tvl1flow; const double scaledEpsilon = epsilon * epsilon * I0.size().area(); CV_DbgAssert( I1.size() == I0.size() ); CV_DbgAssert( I1.type() == I0.type() ); CV_DbgAssert( u1.size() == I0.size() ); CV_DbgAssert( u2.size() == u1.size() ); GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows)); centeredGradient(I1, I1x, I1y); GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows)); GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows)); p11.setTo(Scalar::all(0)); p12.setTo(Scalar::all(0)); p21.setTo(Scalar::all(0)); p22.setTo(Scalar::all(0)); GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows)); const float l_t = static_cast<float>(lambda * theta); const float taut = static_cast<float>(tau / theta); for (int warpings = 0; warpings < warps; ++warpings) { warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c); double error = std::numeric_limits<double>::max(); double prevError = 0.0; for (int n = 0; error > scaledEpsilon && n < iterations; ++n) { // a small trick so that the costly sum()-based convergence check runs less frequently bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon); estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta), calcError); if (calcError) { error = gpu::sum(diff, norm_buf)[0]; prevError = error; } else { error = std::numeric_limits<double>::max(); prevError -= scaledEpsilon; } estimateDualVariables(u1, u2, p11, p12, p21, p22, taut); } } }
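procOneScale is the per-level solver; a hedged sketch of driving it through what is assumed to be the public operator() of OpticalFlowDual_TVL1_GPU, with frame0/frame1 as hypothetical single-channel host images:

cv::gpu::OpticalFlowDual_TVL1_GPU tvl1;
tvl1.iterations = 300;                               // inner iterations per warp (see the loop above)
tvl1.epsilon    = 0.01;                              // stopping threshold; 0 disables the sum()-based check
cv::gpu::GpuMat d_frame0(frame0), d_frame1(frame1);  // CV_8UC1 or CV_32FC1, equal sizes
cv::gpu::GpuMat d_flowx, d_flowy;                    // CV_32FC1 flow components
tvl1(d_frame0, d_frame1, d_flowx, d_flowy);          // the pyramid loop calls procOneScale per level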
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2], GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) { CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4)); const Scalar zero = Scalar::all(0); cudaStream_t cudaStream = StreamAccessor::getStream(stream); //////////////////////////////////////////////////////////////////////////////////////////// // Init int rows = left.rows; int cols = left.cols; rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0))); int levels = rthis.levels; AutoBuffer<int> buf(levels * 4); int* cols_pyr = buf; int* rows_pyr = cols_pyr + levels; int* nr_plane_pyr = rows_pyr + levels; int* step_pyr = nr_plane_pyr + levels; cols_pyr[0] = cols; rows_pyr[0] = rows; nr_plane_pyr[0] = rthis.nr_plane; const int n = 64; step_pyr[0] = static_cast<int>(alignSize(cols * sizeof(T), n) / sizeof(T)); for (int i = 1; i < levels; i++) { cols_pyr[i] = (cols_pyr[i-1] + 1) / 2; rows_pyr[i] = (rows_pyr[i-1] + 1) / 2; nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2; step_pyr[i] = static_cast<int>(alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T)); } Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]); Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2); u[0].create(msg_size, DataType<T>::type); d[0].create(msg_size, DataType<T>::type); l[0].create(msg_size, DataType<T>::type); r[0].create(msg_size, DataType<T>::type); u[1].create(msg_size, DataType<T>::type); d[1].create(msg_size, DataType<T>::type); l[1].create(msg_size, DataType<T>::type); r[1].create(msg_size, DataType<T>::type); disp_selected_pyr[0].create(msg_size, DataType<T>::type); disp_selected_pyr[1].create(msg_size, DataType<T>::type); data_cost.create(data_cost_size, DataType<T>::type); data_cost_selected.create(msg_size, DataType<T>::type); step_pyr[0] = static_cast<int>(data_cost.step / sizeof(T)); Size temp_size = data_cost_size; if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp) temp_size = Size(step_pyr[levels - 1], rows_pyr[levels - 1] * rthis.ndisp); temp.create(temp_size, DataType<T>::type); //////////////////////////////////////////////////////////////////////////// // Compute load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); if (stream) { stream.enqueueMemSet(l[0], zero); stream.enqueueMemSet(d[0], zero); stream.enqueueMemSet(r[0], zero); stream.enqueueMemSet(u[0], zero); stream.enqueueMemSet(l[1], zero); stream.enqueueMemSet(d[1], zero); stream.enqueueMemSet(r[1], zero); stream.enqueueMemSet(u[1], zero); stream.enqueueMemSet(data_cost, zero); stream.enqueueMemSet(data_cost_selected, zero); } else { l[0].setTo(zero); d[0].setTo(zero); r[0].setTo(zero); u[0].setTo(zero); l[1].setTo(zero); d[1].setTo(zero); r[1].setTo(zero); u[1].setTo(zero); data_cost.setTo(zero); data_cost_selected.setTo(zero); } int cur_idx = 0; for (int i = levels - 1; i >= 0; i--) { if (i == levels - 1) { init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], 
rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream); } else { compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream); int new_idx = (cur_idx + 1) & 1; init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(), u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream); cur_idx = new_idx; } calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream); } if (disp.empty()) disp.create(rows, cols, CV_16S); out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out)); if (stream) stream.enqueueMemSet(out, zero); else out.setTo(zero); compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream); if (disp.type() != CV_16S) { if (stream) stream.enqueueConvert(out, disp, disp.type()); else out.convertTo(disp, disp.type()); } }
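csbp_operator is the templated worker behind StereoConstantSpaceBP; a hedged end-user sketch assuming the 2.4-era public class, with hypothetical rectified host images leftImg and rightImg:

cv::gpu::StereoConstantSpaceBP csbp(128 /* ndisp */, 8 /* iters */, 4 /* levels */, 4 /* nr_plane */);
cv::gpu::GpuMat d_left(leftImg), d_right(rightImg);   // CV_8UC1, CV_8UC3 or CV_8UC4, same size (see the asserts)
cv::gpu::GpuMat d_disp;                               // created as CV_16S when left empty
csbp(d_left, d_right, d_disp);
cv::Mat disp;
d_disp.download(disp);                                // bring the disparity map back to the host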