Example #1
void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
{
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );

    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
    GpuMat dst = _dst.getGpuMat();

    cudaStream_t stream = StreamAccessor::getStream(_stream);

    if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
    {
        NppiSize srcsz;
        srcsz.width  = src.cols;
        srcsz.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        switch (src.type())
        {
        case CV_8UC1:
            {
                Npp8u nVal = saturate_cast<Npp8u>(value[0]);
                nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_8UC4:
            {
                Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
                nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32SC1:
            {
                Npp32s nVal = saturate_cast<Npp32s>(value[0]);
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32FC1:
            {
                Npp32f val = saturate_cast<Npp32f>(value[0]);
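                // Npp32s_a is an alias-safe (may_alias) typedef for Npp32s in the
                // original source; the cast reinterprets the float's bit pattern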
                Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  ,    copyMakeBorder_caller<uchar, 2>   ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
            {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
            {   copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/,    copyMakeBorder_caller<ushort, 3> ,    copyMakeBorder_caller<ushort, 4>},
            {   copyMakeBorder_caller<short, 1>  , 0/*copyMakeBorder_caller<short, 2>*/ ,    copyMakeBorder_caller<short, 3>  ,    copyMakeBorder_caller<short, 4>},
            {0/*copyMakeBorder_caller<int,   1>*/, 0/*copyMakeBorder_caller<int,   2>*/ , 0/*copyMakeBorder_caller<int,   3>*/, 0/*copyMakeBorder_caller<int  , 4>*/},
            {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
        };

        caller_t func = callers[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, dst, top, left, borderType, value, stream);
    }
}
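
A minimal usage sketch for the wrapper above (the image contents and border sizes are assumptions for illustration):

#include <opencv2/gpu/gpu.hpp>

void example_copyMakeBorder()
{
    // 8-bit single-channel test image uploaded to the GPU
    cv::gpu::GpuMat d_src(cv::Mat(480, 640, CV_8UC1, cv::Scalar(128)));
    cv::gpu::GpuMat d_padded;

    // CV_8UC1 + BORDER_CONSTANT is dispatched to nppiCopyConstBorder_8u_C1R above
    cv::gpu::copyMakeBorder(d_src, d_padded, 10, 10, 10, 10,
                            cv::BORDER_CONSTANT, cv::Scalar::all(0),
                            cv::gpu::Stream::Null());
    // d_padded is now 500x660 with a 10-pixel black border on every side
}
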
Example #2
void cv::gpu::FarnebackOpticalFlow::operator ()(
        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
    CV_Assert(frame0.type() == CV_8U && frame1.type() == CV_8U);
    CV_Assert(frame0.size() == frame1.size());
    CV_Assert(polyN == 5 || polyN == 7);
    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);

    Stream streams[5];
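    // Note: S(x) abbreviates StreamAccessor::getStream(x) in the original source file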
    if (S(s))
        streams[0] = s;

    Size size = frame0.size();
    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;

    flowx.create(size, CV_32F);
    flowy.create(size, CV_32F);
    GpuMat flowx0 = flowx;
    GpuMat flowy0 = flowy;

    // Crop unnecessary levels
    double scale = 1;
    int numLevelsCropped = 0;
    for (; numLevelsCropped < numLevels; numLevelsCropped++)
    {
        scale *= pyrScale;
        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
            break;
    }

    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);

    if (fastPyramids)
    {
        // Build Gaussian pyramids using pyrDown()
        pyramid0_.resize(numLevelsCropped + 1);
        pyramid1_.resize(numLevelsCropped + 1);
        pyramid0_[0] = frames_[0];
        pyramid1_[0] = frames_[1];
        for (int i = 1; i <= numLevelsCropped; ++i)
        {
            pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
            pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
        }
    }

    setPolynomialExpansionConsts(polyN, polySigma);
    device::optflow_farneback::setUpdateMatricesConsts();

    for (int k = numLevelsCropped; k >= 0; k--)
    {
        streams[0].waitForCompletion();

        scale = 1;
        for (int i = 0; i < k; i++)
            scale *= pyrScale;

        double sigma = (1./scale - 1) * 0.5;
        int smoothSize = cvRound(sigma*5) | 1;
        smoothSize = std::max(smoothSize, 3);

        int width = cvRound(size.width*scale);
        int height = cvRound(size.height*scale);

        if (fastPyramids)
        {
            width = pyramid0_[k].cols;
            height = pyramid0_[k].rows;
        }

        if (k > 0)
        {
            curFlowX.create(height, width, CV_32F);
            curFlowY.create(height, width, CV_32F);
        }
        else
        {
            curFlowX = flowx0;
            curFlowY = flowy0;
        }

        if (!prevFlowX.data)
        {
            if (flags & OPTFLOW_USE_INITIAL_FLOW)
            {
#if ENABLE_GPU_RESIZE
                resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
#else
                Mat tmp1, tmp2;
                flowx0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowX.upload(tmp2);
                flowy0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowY.upload(tmp2);
#endif
            }
            else
            {
                streams[0].enqueueMemSet(curFlowX, 0);
                streams[1].enqueueMemSet(curFlowY, 0);
            }
        }
        else
        {
#if ENABLE_GPU_RESIZE
            resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
#else
            Mat tmp1, tmp2;
            prevFlowX.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowX.upload(tmp2);
            prevFlowY.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowY.upload(tmp2);
#endif
        }

        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
        GpuMat R[2] =
        {
            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
            allocMatFromBuf(5*height, width, CV_32F, R_[1])
        };

        if (fastPyramids)
        {
            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
        }
        else
        {
            GpuMat blurredFrame[2] =
            {
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
            };
            GpuMat pyrLevel[2] =
            {
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
            };

            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);           

            for (int i = 0; i < 2; i++)
            {
                device::optflow_farneback::gaussianBlurGpu(
                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i]));
#if ENABLE_GPU_RESIZE
                resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0, 0, INTER_LINEAR, streams[i]);
#else
                Mat tmp1, tmp2;
                blurredFrame[i].download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
                pyrLevel[i].upload(tmp2);
#endif
                device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
            }
        }

        streams[1].waitForCompletion();
        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));

        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
        {
            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
        }
        for (int i = 0; i < numIters; i++)
        {
            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
            else
                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
        }

        prevFlowX = curFlowX;
        prevFlowY = curFlowY;
    }

    flowx = curFlowX;
    flowy = curFlowY;    

    if (!S(s))
        streams[0].waitForCompletion();
}
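
A usage sketch for the operator above, assuming two 8-bit grayscale frames already uploaded to the GPU (names and parameter values are illustrative):

#include <opencv2/gpu/gpu.hpp>

void example_farneback(const cv::gpu::GpuMat& d_frame0, const cv::gpu::GpuMat& d_frame1)
{
    cv::gpu::FarnebackOpticalFlow flow;
    flow.numLevels    = 5;      // pyramid depth before MIN_SIZE cropping
    flow.pyrScale     = 0.5;    // must be 0.5 when fastPyramids is enabled
    flow.fastPyramids = true;   // build the pyramid with pyrDown()

    cv::gpu::GpuMat d_flowx, d_flowy;
    // Null stream: the call waits for completion before returning
    flow(d_frame0, d_frame1, d_flowx, d_flowy, cv::gpu::Stream::Null());
}
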
Example #3
void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream)
{
    GpuMat src = _src.getGpuMat();
    Mat M = _M.getMat();

    CV_Assert( M.rows == 3 && M.cols == 3 );

    const int interpolation = flags & INTER_MAX;

    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
    CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP) ;

    _dst.create(dsize, src.type());
    GpuMat dst = _dst.getGpuMat();

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    static const bool useNppTab[6][4][3] =
    {
        {
            {false, false, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        }
    };

    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
    // NPP bug on float data
    useNpp = useNpp && src.depth() != CV_32F;

    if (useNpp)
    {
        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
            {
                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
            },
            {
                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
            }
        };

        dst.setTo(borderValue, stream);

        double coeffs[3][3];
        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());

        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream));
    }
    else
    {
        using namespace cv::cuda::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

        if (flags & WARP_INVERSE_MAP)
            M.convertTo(coeffsMat, coeffsMat.type());
        else
        {
            cv::Mat iM;
            invert(M, iM);
            iM.convertTo(coeffsMat, coeffsMat.type());
        }

        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
}
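
A usage sketch for the function above (OpenCV 3.x cv::cuda module; the homography and parameters are illustrative):

#include <opencv2/cudawarping.hpp>

void example_warpPerspective(const cv::cuda::GpuMat& d_src)
{
    // Pure translation by (20, 10) pixels; any 3x3 homography works here
    cv::Mat M = (cv::Mat_<double>(3, 3) << 1, 0, 20,
                                           0, 1, 10,
                                           0, 0,  1);

    cv::cuda::GpuMat d_dst;
    cv::cuda::warpPerspective(d_src, d_dst, M, d_src.size(),
                              cv::INTER_LINEAR, cv::BORDER_CONSTANT,
                              cv::Scalar::all(0), cv::cuda::Stream::Null());
}
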
Example #4
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace ::cv::gpu::device::bf_knnmatch;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                             const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    if (k == 2)
    {
        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
    }
    else
    {
        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
    }

    if (stream)
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
    else
        trainIdx.setTo(Scalar::all(-1));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
}
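
A usage sketch for the low-level call above, assuming 32-bit float descriptors and the standard knnMatchDownload helper for unpacking the packed GPU results:

#include <opencv2/gpu/gpu.hpp>

void example_knnMatchSingle(const cv::gpu::GpuMat& d_query, const cv::gpu::GpuMat& d_train)
{
    cv::gpu::BruteForceMatcher_GPU_base matcher(cv::gpu::BruteForceMatcher_GPU_base::L2Dist);

    cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
    matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2,
                           cv::gpu::GpuMat(), cv::gpu::Stream::Null());

    // Unpack the GPU results into the usual DMatch representation on the host
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatchDownload(d_trainIdx, d_distance, matches);
}
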
Example #5
void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
{
    // dst must already be allocated with matching size/type; otherwise it would be
    // (re)allocated here and would no longer point to page-locked memory
    CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() );
    devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost);
}
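
A sketch of the intended call pattern, assuming cv::gpu::CudaMem provides the page-locked host allocation the comment above refers to:

#include <opencv2/gpu/gpu.hpp>

void example_enqueueDownload(const cv::gpu::GpuMat& d_src, cv::gpu::Stream& stream)
{
    // Page-locked host buffer of matching geometry, so the copy can be truly asynchronous
    cv::gpu::CudaMem host_mem(d_src.rows, d_src.cols, d_src.type());
    cv::Mat dst = host_mem;             // Mat header over the page-locked memory

    stream.enqueueDownload(d_src, dst);
    stream.waitForCompletion();         // dst contents are valid after this point
}
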
Example #6
template <class T>
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));

    const Scalar zero = Scalar::all(0);

    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    // compute sizes
    AutoBuffer<int> buf(levels * 3);
    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;

    cols_pyr[0]     = cols;
    rows_pyr[0]     = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i]     = cols_pyr[i-1] / 2;
        rows_pyr[i]     = rows_pyr[i-1] / 2;
        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
    }


    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;


    //allocate buffers
    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
    buffers_count += 2; //  data_cost has twice as many rows as the other buffers, which is why +2, not +1
    buffers_count += 1; //  data_cost_selected
    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);

    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);

    for(int k = 0; k < 2; ++k) // in/out
    {
        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);

        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
        for(int _r = 0; _r < 5; ++_r)
        {
            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
            assert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
        }
    }

    size_t elem_step = mbuf.step / sizeof(T);

    Size temp_size = data_cost.size();
    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    if (stream)
    {
        stream.enqueueMemSet(l[0], zero);
        stream.enqueueMemSet(d[0], zero);
        stream.enqueueMemSet(r[0], zero);
        stream.enqueueMemSet(u[0], zero);

        stream.enqueueMemSet(l[1], zero);
        stream.enqueueMemSet(d[1], zero);
        stream.enqueueMemSet(r[1], zero);
        stream.enqueueMemSet(u[1], zero);

        stream.enqueueMemSet(data_cost, zero);
        stream.enqueueMemSet(data_cost_selected, zero);
    }
    else
    {
        l[0].setTo(zero);
        d[0].setTo(zero);
        r[0].setTo(zero);
        u[0].setTo(zero);

        l[1].setTo(zero);
        d[1].setTo(zero);
        r[1].setTo(zero);
        u[1].setTo(zero);

        data_cost.setTo(zero);
        data_cost_selected.setTo(zero);
    }

    int cur_idx = 0;

    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            int new_idx = (cur_idx + 1) & 1;

            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    // Write into disp directly when it is already CV_16S; otherwise create out as CV_16S and convert into disp at the end
    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

    if (stream)
        stream.enqueueMemSet(out, zero);
    else
        out.setTo(zero);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
        if (stream)
            stream.enqueueConvert(out, disp, disp.type());
        else
            out.convertTo(disp, disp.type());
    }
}
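
The template helper above backs the public interface; a usage sketch of that interface (parameter values are illustrative):

#include <opencv2/gpu/gpu.hpp>

void example_csbp(const cv::gpu::GpuMat& d_left, const cv::gpu::GpuMat& d_right)
{
    // ndisp = 128, iters = 8, levels = 4, nr_plane = 4
    cv::gpu::StereoConstantSpaceBP bp(128, 8, 4, 4);

    cv::gpu::GpuMat d_disp;
    bp(d_left, d_right, d_disp, cv::gpu::Stream::Null());   // d_disp is created as CV_16S if empty
}
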
Example #7
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
    using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;

    typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

    static Caller multipass_callers[] =
    {
        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
    };

    static Caller singlepass_callers[] =
    {
        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
    };

    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
    };

    static MaskedCaller masked_singlepass_callers[] =
    {
        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
    };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    double minVal_;
    if (!minVal) minVal = &minVal_;
    double maxVal_;
    if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
                       valbuf_size.height, locbuf_size.width, locbuf_size.height);
    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);

    if (mask.empty())
    {
        Caller* callers = multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
    {
        MaskedCaller* callers = masked_multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }

    if (minLoc) {
        minLoc->x = minLoc_[0];
        minLoc->y = minLoc_[1];
    }
    if (maxLoc) {
        maxLoc->x = maxLoc_[0];
        maxLoc->y = maxLoc_[1];
    }
}
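
A usage sketch for the reduction above; the buffers are reused across calls, and an empty GpuMat means "no mask":

#include <opencv2/gpu/gpu.hpp>

void example_minMaxLoc(const cv::gpu::GpuMat& d_img)
{
    double minVal = 0.0, maxVal = 0.0;
    cv::Point minLoc, maxLoc;

    cv::gpu::GpuMat valBuf, locBuf;
    cv::gpu::minMaxLoc(d_img, &minVal, &maxVal, &minLoc, &maxLoc,
                       cv::gpu::GpuMat(), valBuf, locBuf);
}
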
Example #8
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
    class LevelsInit
    {
    public:
        Npp32s pLevels[256];
        const Npp32s* pLevels3[3];
        int nValues3[3];

#if (CUDA_VERSION > 4020)
        GpuMat d_pLevels;
#endif

        LevelsInit()
        {
            nValues3[0] = nValues3[1] = nValues3[2] = 256;
            for (int i = 0; i < 256; ++i)
                pLevels[i] = i;


#if (CUDA_VERSION <= 4020)
            pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
            d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
            pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
        }
    };
    static LevelsInit lvls;

    int cn = src.channels();

    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
    CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous());

    dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));

    NppiSize sz;
    sz.height = src.rows;
    sz.width = src.cols;

    Mat nppLut;
    lut.convertTo(nppLut, CV_32S);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    if (src.type() == CV_8UC1)
    {
#if (CUDA_VERSION <= 4020)
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
#else
        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
#endif
    }
    else
    {
        const Npp32s* pValues3[3];

        Mat nppLut3[3];
#if (CUDA_VERSION > 4020)
        // Keep the device-side LUT copies alive until after the NPP call below;
        // declaring them inside the branches would leave pValues3 dangling
        GpuMat d_nppLut;
        GpuMat d_nppLut3[3];
#endif
        if (nppLut.channels() == 1)
        {
#if (CUDA_VERSION <= 4020)
            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
            d_nppLut.upload(Mat(1, 256, CV_32S, nppLut.data));
            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
        }
        else
        {
            cv::split(nppLut, nppLut3);

#if (CUDA_VERSION <= 4020)
            pValues3[0] = nppLut3[0].ptr<Npp32s>();
            pValues3[1] = nppLut3[1].ptr<Npp32s>();
            pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else
            for (int i = 0; i < 3; ++i)
            {
                d_nppLut3[i].upload(Mat(1, 256, CV_32S, nppLut3[i].data));
                pValues3[i] = d_nppLut3[i].ptr<Npp32s>();
            }
#endif
        }

        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
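
A usage sketch for the function above: apply a 256-entry intensity-inverting table to an 8-bit image (the table contents are illustrative):

#include <opencv2/gpu/gpu.hpp>

void example_LUT(const cv::gpu::GpuMat& d_src)   // CV_8UC1 or CV_8UC3
{
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = static_cast<uchar>(255 - i);

    cv::gpu::GpuMat d_dst;
    // Null stream: the call synchronizes via cudaDeviceSynchronize above
    cv::gpu::LUT(d_src, lut, d_dst, cv::gpu::Stream::Null());
}
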
Example #9
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
    CV_Assert(M.rows == 3 && M.cols == 3);

    int interpolation = flags & INTER_MAX;

    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    static const bool useNppTab[6][4][3] =
    {
        {
            {false, false, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        }
    };

    bool useNpp = borderMode == BORDER_CONSTANT;
    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
    #ifdef linux
        // NPP bug on float data
        useNpp = useNpp && src.depth() != CV_32F;
    #endif

    if (useNpp)
    {
        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
            {
                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
            },
            {
                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
            }
        };

        double coeffs[3][3];
        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());

        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, int cc);

        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

        dst.create(dsize, src.type());

        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

        if (flags & WARP_INVERSE_MAP)
            M.convertTo(coeffsMat, coeffsMat.type());
        else
        {
            cv::Mat iM;
            invert(M, iM);
            iM.convertTo(coeffsMat, coeffsMat.type());
        }

        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

        DeviceInfo info;
        int cc = info.majorVersion() * 10 + info.minorVersion();

        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
    }
}
Example #10
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
            || interpolation == INTER_CUBIC || interpolation == INTER_AREA);
    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));

    if (dsize == Size())
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }
    if (dsize != dst.size())
        dst.create(dsize, src.type());

    if (dsize == src.size())
    {
        if (s)
            s.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    cudaStream_t stream = StreamAccessor::getStream(s);

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (src.type() == CV_8UC4 && interpolation != INTER_AREA));

    if (useNpp)
    {
        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
                                    double xFactor, double yFactor, int eInterpolation);

        const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };

        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};

        NppiSize srcsz;
        srcsz.width  = wholeSize.width;
        srcsz.height = wholeSize.height;

        NppiRect srcrect;
        srcrect.x = ofs.x;
        srcrect.y = ofs.y;
        srcrect.width  = src.cols;
        srcrect.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        using namespace ::cv::gpu::device::imgproc;

        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);

        static const func_t funcs[6][4] =
        {
            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
    }
}
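
A usage sketch for the function above; with an empty dsize the output size is computed from fx/fy, exactly as the checks at the top require:

#include <opencv2/gpu/gpu.hpp>

void example_resize(const cv::gpu::GpuMat& d_src)
{
    cv::gpu::GpuMat d_half;
    cv::gpu::resize(d_src, d_half, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR,
                    cv::gpu::Stream::Null());
}
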
Example #11
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT
    (void) _src;
    (void) _dst;
    (void) dft_size;
    (void) flags;
    (void) stream;
    throw_no_cuda();
#else
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );

    // We don't support unpacked output (in the case of real input)
    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );

    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
    const bool is_complex_input  = src.channels() == 2;
    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);

    // We don't support real-to-real transform
    CV_Assert( is_complex_input || is_complex_output );

    GpuMat src_cont = src;

    // Make sure here we work with the continuous input,
    // as CUFFT can't handle gaps
    createContinuous(src.rows, src.cols, src.type(), src_cont);
    if (src_cont.data != src.data)
        src.copyTo(src_cont, stream);

    Size dft_size_opt = dft_size;
    if (is_1d_input && !is_row_dft)
    {
        // If the source matrix is a single column, handle it as a single row
        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
    }

    CV_Assert( dft_size_opt.width > 1 );

    cufftType dft_type = CUFFT_R2C;
    if (is_complex_input)
        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;

    cufftHandle plan;
    if (is_1d_input || is_row_dft)
        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
    else
        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );

    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );

    if (is_complex_input)
    {
        if (is_complex_output)
        {
            createContinuous(dft_size, CV_32FC2, _dst);
            GpuMat dst = _dst.getGpuMat();

            cufftSafeCall(cufftExecC2C(
                              plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                              is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
        }
        else
        {
            createContinuous(dft_size, CV_32F, _dst);
            GpuMat dst = _dst.getGpuMat();

            cufftSafeCall(cufftExecC2R(
                              plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
        }
    }
    else
    {
        // dft_size may have been swapped above for efficiency; reflect that in the output size
        if (dft_size == dft_size_opt)
            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
        else
            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);

        GpuMat dst = _dst.getGpuMat();

        cufftSafeCall(cufftExecR2C(
                          plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
    }

    cufftSafeCall( cufftDestroy(plan) );

    if (is_scaled_dft)
        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);

#endif
}
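
A usage sketch for the function above (OpenCV 3.x; the header name is the standard cudaarithm module):

#include <opencv2/cudaarithm.hpp>

void example_dft(const cv::cuda::GpuMat& d_img)   // CV_32FC1 input
{
    cv::cuda::GpuMat d_spectrum;
    // Real-to-complex transform: packed CV_32FC2 output with width/2 + 1 columns
    cv::cuda::dft(d_img, d_spectrum, d_img.size(), 0, cv::cuda::Stream::Null());
}
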
Example #12
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
    using namespace mathfunc::minmax;

    typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);

    static Caller multipass_callers[7] = { 
            minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, 
            minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, 
            minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };

    static Caller singlepass_callers[7] = { 
            minMaxCaller<unsigned char>, minMaxCaller<char>, 
            minMaxCaller<unsigned short>, minMaxCaller<short>, 
            minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };

    static MaskedCaller masked_multipass_callers[7] = { 
            minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, 
            minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
            minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };

    static MaskedCaller masked_singlepass_callers[7] = { 
            minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, 
            minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, 
            minMaxMaskCaller<int>, minMaxMaskCaller<float>, 
            minMaxMaskCaller<double> };

    CV_Assert(src.channels() == 1);

    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

    CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && 
                                       DeviceInfo().supports(NATIVE_DOUBLE)));

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    
    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    if (mask.empty())
    {
        Caller* callers = multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, minVal, maxVal, buf);
    }
    else
    {
        MaskedCaller* callers = masked_multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, mask, minVal, maxVal, buf);
    }
}
Example #13
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
}
Example #14
Mat visionUtils::skinDetect(Mat captureframe, Mat3b *skinDetectHSV, Mat *skinMask, std::vector<int> adaptiveHSV, int minPixelSize, int imgBlurPixels, int imgMorphPixels, int singleRegionChoice, bool displayFaces)
{

    if (adaptiveHSV.size() != 6)
    {
        adaptiveHSV.clear();
        adaptiveHSV.push_back(5);
        adaptiveHSV.push_back(38);
        adaptiveHSV.push_back(51);
        adaptiveHSV.push_back(17);
        adaptiveHSV.push_back(250);
        adaptiveHSV.push_back(242);
    }


    //int step = 0;
    Mat3b frameTemp;
    Mat3b frame;
    // Force resize to 640x480: all thresholds / pixel filters are configured for this size.
    // The image is resized back to its original dimensions before returning.
    Size s = captureframe.size();
    cv::resize(captureframe,captureframe,Size(640,480));



    if (useGPU)
    {
        GpuMat imgGPU, imgGPUHSV;
        imgGPU.upload(captureframe);
        cv::cvtColor(imgGPU, imgGPUHSV, CV_BGR2HSV);
        GaussianBlur(imgGPUHSV, imgGPUHSV, Size(imgBlurPixels,imgBlurPixels), 1, 1);
        imgGPUHSV.download(frameTemp);
    }
    else
    {
        cv::cvtColor(captureframe, frameTemp, CV_BGR2HSV);
        GaussianBlur(frameTemp, frameTemp, Size(imgBlurPixels,imgBlurPixels), 1, 1);
    }

    // Potentially faster version using inRange
    Mat frameThreshold = Mat::zeros(frameTemp.rows,frameTemp.cols, CV_8UC1);
    Mat hsvMin = (Mat_<int>(1,3) << adaptiveHSV[0], adaptiveHSV[1],adaptiveHSV[2] );
    Mat hsvMax = (Mat_<int>(1,3) << adaptiveHSV[3], adaptiveHSV[4],adaptiveHSV[5] );
    inRange(frameTemp,hsvMin ,hsvMax, frameThreshold);
    frameTemp.copyTo(frame,frameThreshold);

    /* BGR CONVERSION AND THRESHOLD */
    Mat1b frame_gray;

    // send HSV to skinDetectHSV for return
    *skinDetectHSV=frame.clone();

    cv::cvtColor(frame, frame_gray, CV_BGR2GRAY);


    // Adaptive thresholding technique
    // 1. Threshold data to find main areas of skin
    adaptiveThreshold(frame_gray,frame_gray,255,ADAPTIVE_THRESH_GAUSSIAN_C,THRESH_BINARY_INV,9,1);


    if (useGPU)
    {
        GpuMat imgGPU;
        imgGPU.upload(frame_gray);
        // 2. Fill in thresholded areas
#if CV_MAJOR_VERSION == 2
        gpu::morphologyEx(imgGPU, imgGPU, CV_MOP_CLOSE, Mat1b(imgMorphPixels,imgMorphPixels,1), Point(-1, -1), 2);
        gpu::GaussianBlur(imgGPU, imgGPU, Size(imgBlurPixels,imgBlurPixels), 1, 1);
#elif CV_MAJOR_VERSION == 3
        //TODO: Check if that's correct
        Mat element = getStructuringElement(MORPH_RECT, Size(imgMorphPixels, imgMorphPixels), Point(-1, -1));
        Ptr<cuda::Filter> closeFilter = cuda::createMorphologyFilter(MORPH_CLOSE, imgGPU.type(), element, Point(-1, -1), 2);
        closeFilter->apply(imgGPU, imgGPU);
        cv::Ptr<cv::cuda::Filter> gaussianFilter = cv::cuda::createGaussianFilter(imgGPU.type(), imgGPU.type(), Size(imgMorphPixels, imgMorphPixels), 1, 1);
        gaussianFilter->apply(imgGPU, imgGPU);
#endif

        imgGPU.download(frame_gray);
    }
    else
    {
        // 2. Fill in thresholded areas
        morphologyEx(frame_gray, frame_gray, CV_MOP_CLOSE, Mat1b(imgMorphPixels,imgMorphPixels,1), Point(-1, -1), 2);
        GaussianBlur(frame_gray, frame_gray, Size(imgBlurPixels,imgBlurPixels), 1, 1);
    }

    // Select single largest region from image if singleRegionChoice is set (1)
    if (singleRegionChoice)
    {
        *skinMask = cannySegmentation(frame_gray, -1, displayFaces);
    }
    else // Detect each separate block and remove blobs smaller than a few pixels
    {
        *skinMask = cannySegmentation(frame_gray, minPixelSize, displayFaces);
    }

    // Just return skin
    Mat frame_skin;
    captureframe.copyTo(frame_skin,*skinMask);  // Copy captureframe data to frame_skin, using *skinMask as the mask
    // Resize image to original before return
    cv::resize(frame_skin,frame_skin,s);

    if (displayFaces)
    {
        imshow("Skin HSV (B)",frame);
        imshow("Adaptive_threshold (D1)",frame_gray);
        imshow("Skin segmented",frame_skin);
    }

    return frame_skin;
    waitKey(1);
}
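
A hypothetical call for the method above; the instance, input image and every parameter value are assumptions for illustration (visionUtils is the project's own class, declared in a header not shown here):

cv::Mat frame = cv::imread("face.png");      // BGR input frame
visionUtils utils;

cv::Mat3b skinHSV;
cv::Mat skinMask;
std::vector<int> adaptiveHSV;                // empty -> defaults are filled in above

cv::Mat skin = utils.skinDetect(frame, &skinHSV, &skinMask, adaptiveHSV,
                                5000 /*minPixelSize*/, 9 /*imgBlurPixels*/,
                                7 /*imgMorphPixels*/, 0 /*singleRegionChoice*/,
                                false /*displayFaces*/);
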
Example #15
void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace cv::gpu::device::bf_knnmatch;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream);

    static const caller_t callersL1[] =
    {
        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
        matchL1_gpu<int>, matchL1_gpu<float>
    };
    static const caller_t callersL2[] =
    {
        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
    };
    static const caller_t callersHamming[] =
    {
        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);

    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    if (k == 2)
    {
        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
    }
    else
    {
        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
    }

    if (stream)
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
    else
        trainIdx.setTo(Scalar::all(-1));

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

    func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}
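
For comparison with the low-level call above, the matcher also exposes a one-shot knnMatch that wraps knnMatchSingle plus the download step (a sketch, assuming the standard 2.4 BFMatcher_GPU interface with a default empty mask):

#include <opencv2/gpu/gpu.hpp>

void example_bfmatcher(const cv::gpu::GpuMat& d_query, const cv::gpu::GpuMat& d_train)
{
    cv::gpu::BFMatcher_GPU matcher(cv::NORM_L2);   // norm must suit the descriptor type

    // k = 2 nearest neighbours, e.g. for a ratio test on the host afterwards
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatch(d_query, d_train, matches, 2);
}
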
Example #16
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask)
{
    using namespace cv::gpu::device::gfft;

    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));

    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    ensureSizeIsEnough(image.size(), CV_32F, eig_);

    if (useHarrisDetector)
        cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK);
    else
        cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3);

    double maxVal = 0;
    minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_);

    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

    int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);

    if (total == 0)
    {
        corners.release();
        return;
    }

    sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);

    if (minDistance < 1)
        tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners);
    else
    {
        vector<Point2f> tmp(total);
        Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
        tmpCorners_.colRange(0, total).download(tmpMat);

        vector<Point2f> tmp2;
        tmp2.reserve(total);

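        // enforce minDistance on the host: bucket accepted corners into cells of
        // cell_size pixels so only the 3x3 block of neighbouring cells has to be
        // searched for a conflicting corner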
        const int cell_size = cvRound(minDistance);
        const int grid_width = (image.cols + cell_size - 1) / cell_size;
        const int grid_height = (image.rows + cell_size - 1) / cell_size;

        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);

        for (int i = 0; i < total; ++i)
        {
            Point2f p = tmp[i];

            bool good = true;

            int x_cell = static_cast<int>(p.x / cell_size);
            int y_cell = static_cast<int>(p.y / cell_size);

            int x1 = x_cell - 1;
            int y1 = y_cell - 1;
            int x2 = x_cell + 1;
            int y2 = y_cell + 1;

            // boundary check
            x1 = std::max(0, x1);
            y1 = std::max(0, y1);
            x2 = std::min(grid_width - 1, x2);
            y2 = std::min(grid_height - 1, y2);

            for (int yy = y1; yy <= y2; yy++)
            {
                for (int xx = x1; xx <= x2; xx++)
                {
                    vector<Point2f>& m = grid[yy * grid_width + xx];

                    if (!m.empty())
                    {
                        for(size_t j = 0; j < m.size(); j++)
                        {
                            float dx = p.x - m[j].x;
                            float dy = p.y - m[j].y;

                            if (dx * dx + dy * dy < minDistance * minDistance)
                            {
                                good = false;
                                goto break_out;
                            }
                        }
                    }
                }
            }

            break_out:

            if(good)
            {
                grid[y_cell * grid_width + x_cell].push_back(p);

                tmp2.push_back(p);

                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
                    break;
            }
        }

        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
    }
}
Example #17
double cv::gpu::norm(const GpuMat& src1, int normType)
{
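    // the norm of src1 alone, computed as its distance to an all-zero matrix
    // of the same size and type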
    return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}
Example #18
void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
{
    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
    CV_Assert( I0.size() == I1.size() );
    CV_Assert( I0.type() == I1.type() );
    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
    CV_Assert( nscales > 0 );

    // allocate memory for the pyramid structure
    I0s.resize(nscales);
    I1s.resize(nscales);
    u1s.resize(nscales);
    u2s.resize(nscales);
    u3s.resize(nscales);

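    // convert the inputs to CV_32F; 8U images keep their 0..255 range, while
    // float inputs (assumed normalized) are scaled by 255 to match it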
    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);

    if (!useInitialFlow)
    {
        flowx.create(I0.size(), CV_32FC1);
        flowy.create(I0.size(), CV_32FC1);
    }

    u1s[0] = flowx;
    u2s[0] = flowy;
    if (gamma)
        u3s[0].create(I0.size(), CV_32FC1);

    I1x_buf.create(I0.size(), CV_32FC1);
    I1y_buf.create(I0.size(), CV_32FC1);

    I1w_buf.create(I0.size(), CV_32FC1);
    I1wx_buf.create(I0.size(), CV_32FC1);
    I1wy_buf.create(I0.size(), CV_32FC1);

    grad_buf.create(I0.size(), CV_32FC1);
    rho_c_buf.create(I0.size(), CV_32FC1);

    p11_buf.create(I0.size(), CV_32FC1);
    p12_buf.create(I0.size(), CV_32FC1);
    p21_buf.create(I0.size(), CV_32FC1);
    p22_buf.create(I0.size(), CV_32FC1);
    if (gamma)
    {
        p31_buf.create(I0.size(), CV_32FC1);
        p32_buf.create(I0.size(), CV_32FC1);
    }
    diff_buf.create(I0.size(), CV_32FC1);

    // create the scales
    for (int s = 1; s < nscales; ++s)
    {
        cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
        cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);

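        // stop generating coarser scales once the image drops below 16 pixels
        // in either dimension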
        if (I0s[s].cols < 16 || I0s[s].rows < 16)
        {
            nscales = s;
            break;
        }

        if (useInitialFlow)
        {
            cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
            cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);

            cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
            cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
        }
        else
        {
            u1s[s].create(I0s[s].size(), CV_32FC1);
            u2s[s].create(I0s[s].size(), CV_32FC1);
        }
        if (gamma)
            u3s[s].create(I0s[s].size(), CV_32FC1);
    }

    if (!useInitialFlow)
    {
        u1s[nscales-1].setTo(Scalar::all(0));
        u2s[nscales-1].setTo(Scalar::all(0));
    }
    if (gamma)
        u3s[nscales - 1].setTo(Scalar::all(0));

    // pyramidal structure for computing the optical flow
    for (int s = nscales - 1; s >= 0; --s)
    {
        // compute the optical flow at the current scale
        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s]);

        // if this was the last scale, finish now
        if (s == 0)
            break;

        // otherwise, upsample the optical flow

        // zoom the optical flow for the next finer scale
        cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
        cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
        if (gamma)
            cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());

        // scale the optical flow with the appropriate zoom factor
        cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
        cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
    }
}
Example #19
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
    (void)src1;
    (void)src2;
    (void)alpha;
    (void)src3;
    (void)beta;
    (void)dst;
    (void)flags;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else
    // CUBLAS works with column-major matrices

    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));

    if (src1.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    bool tr1 = (flags & GEMM_1_T) != 0;
    bool tr2 = (flags & GEMM_2_T) != 0;
    bool tr3 = (flags & GEMM_3_T) != 0;

    if (src1.type() == CV_64FC2)
    {
        if (tr1 || tr2 || tr3)
            CV_Error(CV_StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
    }

    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
    Size dstSize(src2Size.width, src1Size.height);

    CV_Assert(src1Size.width == src2Size.height);
    CV_Assert(src3.empty() || src3Size == dstSize);

    dst.create(dstSize, src1.type());

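    // when beta != 0, gemm accumulates into dst, so dst must be pre-filled with
    // src3 (or zeros when src3 is empty)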
    if (beta != 0)
    {
        if (src3.empty())
        {
            if (stream)
                stream.enqueueMemSet(dst, Scalar::all(0));
            else
                dst.setTo(Scalar::all(0));
        }
        else
        {
            if (tr3)
            {
                transpose(src3, dst, stream);
            }
            else
            {
                if (stream)
                    stream.enqueueCopy(src3, dst);
                else
                    src3.copyTo(dst);
            }
        }
    }

    cublasHandle_t handle;
    cublasSafeCall( cublasCreate_v2(&handle) );

    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );

    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );

    const float alphaf = static_cast<float>(alpha);
    const float betaf = static_cast<float>(beta);

    const cuComplex alphacf = make_cuComplex(alphaf, 0);
    const cuComplex betacf = make_cuComplex(betaf, 0);

    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);

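    // CUBLAS expects column-major storage, so each row-major matrix is implicitly
    // its transpose; swapping the operands computes dst^T = op(src2)^T * op(src1)^T,
    // which is exactly op(src1) * op(src2) when dst is read back row-major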
    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;

    switch (src1.type())
    {
    case CV_32FC1:
        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphaf,
            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
            &betaf,
            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
        break;

    case CV_64FC1:
        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alpha,
            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
            &beta,
            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
        break;

    case CV_32FC2:
        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphacf,
            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
            &betacf,
            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
        break;

    case CV_64FC2:
        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphac,
            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
            &betac,
            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
        break;
    }

    cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
Example #20
void cv::cuda::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
{
    CV_Assert( prev.type() == CV_8UC1 );
    CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );

    const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width,
                       (prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);

    velx.create(velSize, CV_32FC1);
    vely.create(velSize, CV_32FC1);

    // scanning scheme coordinates
    std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1));
    int ssCount = 0;

    // Calculate scanning scheme
    const int minCount = std::min(maxRange.width, maxRange.height);

    // use spiral search pattern
    //
    //     9 10 11 12
    //     8  1  2 13
    //     7  *  3 14
    //     6  5  4 15
    //... 20 19 18 17
    //

    for (int i = 0; i < minCount; ++i)
    {
        // four cycles along sides
        int x = -i - 1, y = x;

        // upper side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) ++x;
            ss[ssCount].y = (short) y;
        }

        // right side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) x;
            ss[ssCount].y = (short) ++y;
        }

        // bottom side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) --x;
            ss[ssCount].y = (short) y;
        }

        // left side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) x;
            ss[ssCount].y = (short) --y;
        }
    }

    // handle the remaining rings when the search range is not square
    if (maxRange.width < maxRange.height)
    {
        const int xleft = -minCount;

        // cycle by neighbor rings
        for (int i = minCount; i < maxRange.height; ++i)
        {
            // two cycles by x
            int y = -(i + 1);
            int x = xleft;

            // upper side
            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }

            x = xleft;
            y = -y;

            // bottom side
            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }
        }
    }
    else if (maxRange.width > maxRange.height)
    {
        const int yupper = -minCount;

        // cycle by neighbor rings
        for (int i = minCount; i < maxRange.width; ++i)
        {
            // two cycles by y
            int x = -(i + 1);
            int y = yupper;

            // left side
            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }

            y = yupper;
            x = -x;

            // right side
            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }
        }
    }

    const cudaStream_t stream = StreamAccessor::getStream(st);

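    // upload the scanning scheme to the device, asynchronously when the caller
    // supplied a stream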
    ensureSizeIsEnough(1, ssCount, CV_16SC2, buf);
    if (stream == 0)
        cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
    else
        cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) );

    const int maxX = prev.cols - blockSize.width;
    const int maxY = prev.rows - blockSize.height;

    const int SMALL_DIFF = 2;
    const int BIG_DIFF = 128;

    const int blSize = blockSize.area();
    const int acceptLevel = blSize * SMALL_DIFF;
    const int escapeLevel = blSize * BIG_DIFF;

    optflowbm::calc(prev, curr, velx, vely,
                    make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
                    maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
}
Example #21
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
    using namespace cv::gpu::device::pyrlk;

    if (prevPts.empty())
    {
        nextPts.release();
        status.release();
        if (err) err->release();
        return;
    }

    dim3 block, patch;
    calcPatchSize(winSize, block, patch, isDeviceArch11_);

    CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
    CV_Assert(winSize.width > 2 && winSize.height > 2);
    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);

    if (useInitialFlow)
        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
    else
        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);

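    // pre-scale the initial point coordinates: 1/2^maxLevel maps them onto the
    // coarsest pyramid level, and the extra factor of 1/2 offsets the doubling
    // the kernels apply at each level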
    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
    GpuMat temp2 = nextPts.reshape(1);
    multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);

    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
    status.setTo(Scalar::all(1));

    if (err)
        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);

    // build the image pyramids.

    prevPyr_.resize(maxLevel + 1);
    nextPyr_.resize(maxLevel + 1);

    int cn = prevImg.channels();

    if (cn == 1 || cn == 4)
    {
        prevImg.convertTo(prevPyr_[0], CV_32F);
        nextImg.convertTo(nextPyr_[0], CV_32F);
    }
    else
    {
        cvtColor(prevImg, dx_calcBuf_, COLOR_BGR2BGRA);
        dx_calcBuf_.convertTo(prevPyr_[0], CV_32F);

        cvtColor(nextImg, dx_calcBuf_, COLOR_BGR2BGRA);
        dx_calcBuf_.convertTo(nextPyr_[0], CV_32F);
    }

    for (int level = 1; level <= maxLevel; ++level)
    {
        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
    }

    loadConstants(make_int2(winSize.width, winSize.height), iters);

    for (int level = maxLevel; level >= 0; level--)
    {
        if (cn == 1)
        {
            lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
        else
        {
            lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
    }
}
Example #22
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
    CV_Assert(terminals.type() == CV_32S);
#else
    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
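    // (the 32f flavour of the NPP 8-neighborhood graphcut is only available
    // starting with CUDA 5.0)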

    Size src_size = terminals.size();

    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(leftTransp.type() == terminals.type());

    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(rightTransp.type() == terminals.type());

    CV_Assert(top.size() == src_size);
    CV_Assert(top.type() == terminals.type());

    CV_Assert(topLeft.size() == src_size);
    CV_Assert(topLeft.type() == terminals.type());

    CV_Assert(topRight.size() == src_size);
    CV_Assert(topRight.type() == terminals.type());

    CV_Assert(bottom.size() == src_size);
    CV_Assert(bottom.type() == terminals.type());

    CV_Assert(bottomLeft.size() == src_size);
    CV_Assert(bottomLeft.type() == terminals.type());

    CV_Assert(bottomRight.size() == src_size);
    CV_Assert(bottomRight.type() == terminals.type());

    labels.create(src_size, CV_8U);

    NppiSize sznpp;
    sznpp.width = src_size.width;
    sznpp.height = src_size.height;

    int bufsz;
    nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );

    ensureSizeIsEnough(1, bufsz, CV_8U, buf);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);

#if (CUDA_VERSION < 5000)
    nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
        top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
        bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
    if (terminals.type() == CV_32S)
    {
        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
    }
    else
    {
        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
    }
#endif

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
Example #23
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

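    // pack the device compute capability as major * 10 + minor; the kernels use
    // it to pick an implementation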
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }

    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
}
Example #24
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12));

    CV_Assert(src.type() == CV_8UC4);
    const int nrows = src.rows;
    const int ncols = src.cols;
    const int hr = sr;
    const int hsp = sp;

    // Run the mean shift procedure to obtain the region and spatial maps
    GpuMat h_rmap, h_spmap;
    meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria);
    Mat rmap = h_rmap;
    Mat spmap = h_spmap;

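    // vertices are pixels; every interior pixel is linked to its right, bottom
    // and both diagonal neighbours, with extra edges along the last row and column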
    Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
                                        + (nrows - 1) + (ncols - 1));

    // Build the region adjacency graph from the image
    Vec4b r1;
    Vec4b r2[4];
    Vec2s sp1;
    Vec2s sp2[4];
    int dr[4];
    int dsp[4];
    for (int y = 0; y < nrows - 1; ++y)
    {
        Vec4b* ry = rmap.ptr<Vec4b>(y);
        Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
        Vec2s* spy = spmap.ptr<Vec2s>(y);
        Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
        for (int x = 0; x < ncols - 1; ++x)
        {
            r1 = ry[x];
            sp1 = spy[x];

            r2[0] = ry[x + 1];
            r2[1] = ryp[x];
            r2[2] = ryp[x + 1];
            r2[3] = ryp[x];

            sp2[0] = spy[x + 1];
            sp2[1] = spyp[x];
            sp2[2] = spyp[x + 1];
            sp2[3] = spyp[x];

            dr[0] = dist2(r1, r2[0]);
            dr[1] = dist2(r1, r2[1]);
            dr[2] = dist2(r1, r2[2]);
            dsp[0] = dist2(sp1, sp2[0]);
            dsp[1] = dist2(sp1, sp2[1]);
            dsp[2] = dist2(sp1, sp2[2]);

            r1 = ry[x + 1];
            sp1 = spy[x + 1];

            dr[3] = dist2(r1, r2[3]);
            dsp[3] = dist2(sp1, sp2[3]);

            g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
            g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
            g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
            g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
        }
    }
    for (int y = 0; y < nrows - 1; ++y)
    {
        r1 = rmap.at<Vec4b>(y, ncols - 1);
        r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
        sp1 = spmap.at<Vec2s>(y, ncols - 1);
        sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
        dr[0] = dist2(r1, r2[0]);
        dsp[0] = dist2(sp1, sp2[0]);
        g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
    }
    for (int x = 0; x < ncols - 1; ++x)
    {
        r1 = rmap.at<Vec4b>(nrows - 1, x);
        r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
        sp1 = spmap.at<Vec2s>(nrows - 1, x);
        sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
        dr[0] = dist2(r1, r2[0]);
        dsp[0] = dist2(sp1, sp2[0]);
        g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
    }

    DjSets comps(g.numv);

    // Find adjacent components
    for (int v = 0; v < g.numv; ++v)
    {
        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
        {
            int c1 = comps.find(v);
            int c2 = comps.find(g.edges[e_it].to);
            if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
                comps.merge(c1, c2);
        }
    }

    vector<SegmLink> edges;
    edges.reserve(g.numv);

    // Prepare edges connecting different components
    for (int v = 0; v < g.numv; ++v)
    {
        int c1 = comps.find(v);
        for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
        {
            int c2 = comps.find(g.edges[e_it].to);
            if (c1 != c2)
                edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
        }
    }

    // Sort all graph edges connecting different components (in ascending order)
    sort(edges.begin(), edges.end());

    // Exclude small components, merging along the sorted edges (closest pairs first)
    for (size_t i = 0; i < edges.size(); ++i)
    {
        int c1 = comps.find(edges[i].from);
        int c2 = comps.find(edges[i].to);
        if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
            comps.merge(c1, c2);
    }

    // Sum the colors of the pixels that belong to the same segment
    Mat h_src = src;
    vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
    for (int y = 0; y < nrows; ++y)
    {
        Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
        for (int x = 0; x < ncols; ++x)
        {
            int parent = comps.find(pix(y, x, ncols));
            Vec4b col = h_srcy[x];
            Vec4i& sumcol = sumcols[parent];
            sumcol[0] += col[0];
            sumcol[1] += col[1];
            sumcol[2] += col[2];
        }
    }

    // Create the final image: each segment gets the average color of its pixels
    dst.create(src.size(), src.type());

    for (int y = 0; y < nrows; ++y)
    {
        Vec4b* dsty = dst.ptr<Vec4b>(y);
        for (int x = 0; x < ncols; ++x)
        {
            int parent = comps.find(pix(y, x, ncols));
            const Vec4i& sumcol = sumcols[parent];
            Vec4b& dstcol = dsty[x];
            dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
            dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
            dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
        }
    }
}
Example #25
void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2)
{
    using namespace tvl1flow;

    const double scaledEpsilon = epsilon * epsilon * I0.size().area();

    CV_DbgAssert( I1.size() == I0.size() );
    CV_DbgAssert( I1.type() == I0.type() );
    CV_DbgAssert( u1.size() == I0.size() );
    CV_DbgAssert( u2.size() == u1.size() );

    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
    centeredGradient(I1, I1x, I1y);

    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));

    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));

    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
    p11.setTo(Scalar::all(0));
    p12.setTo(Scalar::all(0));
    p21.setTo(Scalar::all(0));
    p22.setTo(Scalar::all(0));

    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));

    const float l_t = static_cast<float>(lambda * theta);
    const float taut = static_cast<float>(tau / theta);

    for (int warpings = 0; warpings < warps; ++warpings)
    {
        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);

        double error = std::numeric_limits<double>::max();
        double prevError = 0.0;
        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
        {
            // make the expensive sum reduction run less frequently: only on odd
            // iterations, and only when the previous error hints at convergence
            bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);

            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta), calcError);

            if (calcError)
            {
                error = gpu::sum(diff, norm_buf)[0];
                prevError = error;
            }
            else
            {
                error = std::numeric_limits<double>::max();
                prevError -= scaledEpsilon;
            }

            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
        }
    }
}
Example #26
template <typename T>
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                          GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));

    const Scalar zero = Scalar::all(0);

    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    AutoBuffer<int> buf(levels * 4);

    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;
    int* step_pyr = nr_plane_pyr + levels;

    cols_pyr[0] = cols;
    rows_pyr[0] = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    const int n = 64;
    step_pyr[0] = static_cast<int>(alignSize(cols * sizeof(T), n) / sizeof(T));
    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i] = (cols_pyr[i-1] + 1) / 2;
        rows_pyr[i] = (rows_pyr[i-1] + 1) / 2;

        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;

        step_pyr[i] = static_cast<int>(alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T));
    }

    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);

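    // the message arrays (u, d, l, r) are double-buffered: when descending a level,
    // init_message seeds the buffers at index (cur_idx + 1) & 1 from the current ones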
    u[0].create(msg_size, DataType<T>::type);
    d[0].create(msg_size, DataType<T>::type);
    l[0].create(msg_size, DataType<T>::type);
    r[0].create(msg_size, DataType<T>::type);

    u[1].create(msg_size, DataType<T>::type);
    d[1].create(msg_size, DataType<T>::type);
    l[1].create(msg_size, DataType<T>::type);
    r[1].create(msg_size, DataType<T>::type);

    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
    disp_selected_pyr[1].create(msg_size, DataType<T>::type);

    data_cost.create(data_cost_size, DataType<T>::type);
    data_cost_selected.create(msg_size, DataType<T>::type);

    step_pyr[0] = static_cast<int>(data_cost.step / sizeof(T));

    Size temp_size = data_cost_size;
    if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(step_pyr[levels - 1], rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    if (stream)
    {
        stream.enqueueMemSet(l[0], zero);
        stream.enqueueMemSet(d[0], zero);
        stream.enqueueMemSet(r[0], zero);
        stream.enqueueMemSet(u[0], zero);
        
        stream.enqueueMemSet(l[1], zero);
        stream.enqueueMemSet(d[1], zero);
        stream.enqueueMemSet(r[1], zero);
        stream.enqueueMemSet(u[1], zero);

        stream.enqueueMemSet(data_cost, zero);
        stream.enqueueMemSet(data_cost_selected, zero);
    }
    else
    {
        l[0].setTo(zero);
        d[0].setTo(zero);
        r[0].setTo(zero);
        u[0].setTo(zero);

        l[1].setTo(zero);
        d[1].setTo(zero);
        r[1].setTo(zero);
        u[1].setTo(zero);

        data_cost.setTo(zero);
        data_cost_selected.setTo(zero);
    }

    int cur_idx = 0;

    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            int new_idx = (cur_idx + 1) & 1;

            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

    if (stream)
        stream.enqueueMemSet(out, zero);
    else
        out.setTo(zero);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
        if (stream)
            stream.enqueueConvert(out, disp, disp.type());
        else
            out.convertTo(disp, disp.type());
    }
}