Example #1
void cv::cuda::BFMatcher_CUDA::knnMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace cv::cuda::device::bf_knnmatch;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream);

    static const caller_t callersL1[] =
    {
        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
        matchL1_gpu<int>, matchL1_gpu<float>
    };
    static const caller_t callersL2[] =
    {
        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
    };
    static const caller_t callersHamming[] =
    {
        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);

    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    if (k == 2)
    {
        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
    }
    else
    {
        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
    }

    trainIdx.setTo(Scalar::all(-1), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

    func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}
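
The pattern above recurs throughout these examples: a static table of kernel launchers indexed by the descriptor depth, with 0 entries for unsupported type combinations, followed by a null check before the call. A minimal, self-contained sketch of that dispatch technique (plain C++, hypothetical launcher names, no CUDA required):

#include <cassert>
#include <cstdio>

// Hypothetical per-depth launchers; in the real code these are CUDA kernel
// wrappers such as matchL1_gpu<unsigned char>.
static void run_8u(const void* data)  { std::printf("8-bit path %p\n", data); }
static void run_32f(const void* data) { std::printf("float path %p\n", data); }

enum Depth { DEPTH_8U = 0, DEPTH_8S, DEPTH_16U, DEPTH_16S, DEPTH_32S, DEPTH_32F };

typedef void (*caller_t)(const void* data);

static void dispatch(int depth, const void* data)
{
    // One entry per depth; 0 marks a combination with no kernel instantiation.
    static const caller_t callers[] = { run_8u, 0, 0, 0, 0, run_32f };

    caller_t func = callers[depth];
    assert(func != 0); // mirrors CV_Assert(func != 0) in the wrapper above
    func(data);
}

int main()
{
    int dummy = 0;
    dispatch(DEPTH_8U, &dummy);
    dispatch(DEPTH_32F, &dummy);
    return 0;
}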
Example #2
void cv::cuda::rotate(InputArray _src, OutputArray _dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
    static const func_t funcs[6][4] =
    {
        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
        {0,0,0,0},
        {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
        {0,0,0,0},
        {0,0,0,0},
        {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
    };

    GpuMat src = _src.getGpuMat();

    CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F );
    CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );

    _dst.create(dsize, src.type());
    GpuMat dst = _dst.getGpuMat();

    dst.setTo(Scalar::all(0), stream);

    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
}
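
A hedged usage sketch for the rotate wrapper above, assuming an OpenCV 3.x build with the cudawarping module; the image path is illustrative:

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudawarping.hpp>

int main()
{
    cv::Mat h_src = cv::imread("input.png", cv::IMREAD_GRAYSCALE); // CV_8UC1
    if (h_src.empty()) return 1;

    cv::cuda::GpuMat d_src, d_dst;
    d_src.upload(h_src);

    // Rotate 30 degrees about the origin, shifting right so more of the result stays in view.
    cv::cuda::rotate(d_src, d_dst, h_src.size(), 30.0,
                     h_src.cols / 4.0, 0.0, cv::INTER_LINEAR);

    cv::Mat h_dst;
    d_dst.download(h_dst);
    cv::imwrite("rotated.png", h_dst);
    return 0;
}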
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train,
        GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
        const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace cv::gpu::device::bf_knnmatch;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    if (k == 2)
    {
        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
    }
    else
    {
        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
    }

    if (stream)
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
    else
        trainIdx.setTo(Scalar::all(-1));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}
Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
    };

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result, mask);

    return Scalar(result[0], result[1], result[2], result[3]);
}
Example #5
void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             cudaStream_t stream);

    static const caller_t callersL1[] =
    {
        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
        matchL1_gpu<int>, matchL1_gpu<float>
    };
    static const caller_t callersL2[] =
    {
        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
    };
    static const caller_t callersHamming[] =
    {
        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
    };

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);

    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }

    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
}
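
A usage sketch for the radius matcher above, hedged: it assumes the 2.4-era BFMatcher_GPU interface, including the static radiusMatchDownload helper that unpacks trainIdx/distance/nMatches into std::vector<std::vector<DMatch>> on the host.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <vector>

// d_queryDesc / d_trainDesc: CV_32F descriptor rows already uploaded to the GPU.
void radiusMatchExample(const cv::gpu::GpuMat& d_queryDesc,
                        const cv::gpu::GpuMat& d_trainDesc)
{
    cv::gpu::BFMatcher_GPU matcher(cv::NORM_L2);

    cv::gpu::GpuMat trainIdx, distance, nMatches;
    matcher.radiusMatchSingle(d_queryDesc, d_trainDesc,
                              trainIdx, distance, nMatches, /*maxDistance=*/0.5f);

    // Assumed helper from the same class: copies the packed results to the host.
    std::vector<std::vector<cv::DMatch> > matches;
    cv::gpu::BFMatcher_GPU::radiusMatchDownload(trainIdx, distance, nMatches, matches);
}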
Example #6
void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
                                float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
{
    CV_Assert(frame0.type() == CV_32FC1);
    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
    CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
    CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
    CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
    CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());

    newFrame.create(frame0.size(), frame0.type());

    buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
    buf.setTo(Scalar::all(0));

    // occlusion masks
    GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
    GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);

    // interpolated forward flow
    GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
    GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);

    // interpolated backward flow
    GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
    GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);

    size_t step = frame0.step;

    CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStStreamHandler h(stream);

    NppStInterpolationState state;

    state.size         = NcvSize32u(frame0.cols, frame0.rows);
    state.nStep        = static_cast<Ncv32u>(step);
    state.pSrcFrame0   = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
    state.pSrcFrame1   = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
    state.pFU          = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
    state.pFV          = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
    state.pBU          = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
    state.pBV          = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
    state.pos          = pos;
    state.pNewFrame    = newFrame.ptr<Ncv32f>();
    state.ppBuffers[0] = occ0.ptr<Ncv32f>();
    state.ppBuffers[1] = occ1.ptr<Ncv32f>();
    state.ppBuffers[2] = fui.ptr<Ncv32f>();
    state.ppBuffers[3] = fvi.ptr<Ncv32f>();
    state.ppBuffers[4] = bui.ptr<Ncv32f>();
    state.ppBuffers[5] = bvi.ptr<Ncv32f>();

    ncvSafeCall( nppiStInterpolateFrames(&state) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
Example #7
void cv::cuda::BFMatcher_CUDA::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
    float maxDistance, const std::vector<GpuMat>& masks, Stream& stream)
{
    if (query.empty() || empty())
        return;

    using namespace cv::cuda::device::bf_radius_match;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             cudaStream_t stream);

    static const caller_t callersL1[] =
    {
        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
        matchL1_gpu<int>, matchL1_gpu<float>
    };
    static const caller_t callersL2[] =
    {
        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
    };
    static const caller_t callersHamming[] =
    {
        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
    };

    const int nQuery = query.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));
    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);

    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }

    nMatches.setTo(Scalar::all(0), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

    std::vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end());
    std::vector<PtrStepSzb> masks_(masks.begin(), masks.end());

    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
        float maxDistance, const vector<GpuMat>& masks, Stream& stream)
{
    if (query.empty() || empty())
        return;

    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

    const int nQuery = query.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }

    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end());
    vector<PtrStepSzb> masks_(masks.begin(), masks.end());

    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
         trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
}
Example #9
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{
    CV_Assert(src.type() == CV_8UC1);

    hist.create(1, 256, CV_32SC1);
    hist.setTo(Scalar::all(0));

    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
}
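
A short usage sketch for the histogram wrapper above (2.4-era gpu module assumed; the file name is illustrative):

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <cstdio>

int main()
{
    cv::Mat h_img = cv::imread("gray.png", 0); // grayscale, CV_8UC1
    if (h_img.empty()) return 1;

    cv::gpu::GpuMat d_img(h_img), d_hist;      // constructor uploads the image
    cv::gpu::calcHist(d_img, d_hist);          // produces a 1 x 256 CV_32SC1 row

    cv::Mat h_hist;
    d_hist.download(h_hist);
    std::printf("bin 0 count: %d\n", h_hist.at<int>(0, 0));
    return 0;
}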
void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
    const GpuMat& maskCollection, Stream& stream)
{
    if (query.empty() || trainCollection.empty())
        return;

    using namespace cv::gpu::device::bf_knnmatch;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callersL1[] =
    {
        match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
        match2L1_gpu<unsigned short>, match2L1_gpu<short>,
        match2L1_gpu<int>, match2L1_gpu<float>
    };
    static const caller_t callersL2[] =
    {
        0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
        0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
        0/*match2L2_gpu<int>*/, match2L2_gpu<float>
    };
    static const caller_t callersHamming[] =
    {
        match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
        match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
        match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);

    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;

    const int nQuery = query.rows;

    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);

    if (stream)
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
    else
        trainIdx.setTo(Scalar::all(-1));

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);

    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
}
Example #11
void cv::gpu::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
{
    GpuMat src = _src.getGpuMat();

    CV_Assert( src.type() == CV_8UC1 );

    _hist.create(1, 256, CV_32SC1);
    GpuMat hist = _hist.getGpuMat();

    hist.setTo(Scalar::all(0), stream);

    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
        const GpuMat& maskCollection, Stream& stream)
{
    if (query.empty() || trainCollection.empty())
        return;

    using namespace cv::gpu::device::bf_knnmatch;

    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                             cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
            match2L1_gpu<unsigned short>, match2L1_gpu<short>,
            match2L1_gpu<int>, match2L1_gpu<float>
        },
        {
            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
            0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
            0/*match2L2_gpu<int>*/, match2L2_gpu<float>
        },
        {
            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
            match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
            match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
        }
    };

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

    const int nQuery = query.rows;

    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);

    if (stream)
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
    else
        trainIdx.setTo(Scalar::all(-1));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
}
Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
#ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runAbs<uchar , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, ::sum::runAbs<float , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
    };
#else
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
    };
#endif

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");

    double result[4];
    func(src, buf.data, result, mask);

    return Scalar(result[0], result[1], result[2], result[3]);
}
Example #14
void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
{
    using namespace cv::gpu::cudev::mog;

    int ch = frame.channels();
    int work_ch = ch;

    if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || work_ch != mean_.channels())
        initialize(frame.size(), frame.type());

    fgmask.create(frameSize_, CV_8UC1);
    fgmask.setTo(cv::Scalar::all(0));

    ++nframes_;
    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
    CV_Assert(learningRate >= 0.0f);

    mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, StreamAccessor::getStream(stream));
}
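
A hedged per-frame usage sketch for the MOG2 operator above, assuming the 2.4-era class (default constructor, negative learning rate meaning "choose automatically"; the video path is illustrative):

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::VideoCapture cap("video.avi");
    if (!cap.isOpened()) return 1;

    cv::gpu::MOG2_GPU mog2;             // default number of mixtures
    cv::gpu::GpuMat d_frame, d_fgmask;

    cv::Mat frame, fgmask;
    while (cap.read(frame))
    {
        d_frame.upload(frame);
        mog2(d_frame, d_fgmask, -1.0f); // negative rate: pick 1 / min(2 * nframes, history)
        d_fgmask.download(fgmask);
        // fgmask is CV_8UC1: foreground pixels are 255; shadows get a separate
        // value when shadow detection is enabled.
    }
    return 0;
}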
Example #15
void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors)
{
    using namespace cv::gpu::cudev::optical_flow;

    CV_Assert(u.type() == CV_32FC1);
    CV_Assert(v.type() == u.type() && v.size() == u.size());

    const int NEEDLE_MAP_SCALE = 16;

    const int x_needles = u.cols / NEEDLE_MAP_SCALE;
    const int y_needles = u.rows / NEEDLE_MAP_SCALE;

    GpuMat u_avg(y_needles, x_needles, CV_32FC1);
    GpuMat v_avg(y_needles, x_needles, CV_32FC1);

    NeedleMapAverage_gpu(u, v, u_avg, v_avg);

    const int NUM_VERTS_PER_ARROW = 6;

    const int num_arrows = x_needles * y_needles * NUM_VERTS_PER_ARROW;

    vertex.create(1, num_arrows, CV_32FC3);
    colors.create(1, num_arrows, CV_32FC3);

    colors.setTo(Scalar::all(1.0));

    double uMax, vMax;
    minMax(u_avg, 0, &uMax);
    minMax(v_avg, 0, &vMax);

    float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));

    CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);

    cvtColor(colors, colors, COLOR_HSV2RGB);
}
Example #16
template <typename T>
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                          GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));

    const Scalar zero = Scalar::all(0);

    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    AutoBuffer<int> buf(levels * 4);

    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;
    int* step_pyr = nr_plane_pyr + levels;

    cols_pyr[0] = cols;
    rows_pyr[0] = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    const int n = 64;
    step_pyr[0] = static_cast<int>(alignSize(cols * sizeof(T), n) / sizeof(T));
    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i] = (cols_pyr[i-1] + 1) / 2;
        rows_pyr[i] = (rows_pyr[i-1] + 1) / 2;

        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;

        step_pyr[i] = static_cast<int>(alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T));
    }

    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);

    u[0].create(msg_size, DataType<T>::type);
    d[0].create(msg_size, DataType<T>::type);
    l[0].create(msg_size, DataType<T>::type);
    r[0].create(msg_size, DataType<T>::type);

    u[1].create(msg_size, DataType<T>::type);
    d[1].create(msg_size, DataType<T>::type);
    l[1].create(msg_size, DataType<T>::type);
    r[1].create(msg_size, DataType<T>::type);

    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
    disp_selected_pyr[1].create(msg_size, DataType<T>::type);

    data_cost.create(data_cost_size, DataType<T>::type);
    data_cost_selected.create(msg_size, DataType<T>::type);

    step_pyr[0] = static_cast<int>(data_cost.step / sizeof(T));

    Size temp_size = data_cost_size;
    if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(step_pyr[levels - 1], rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    if (stream)
    {
        stream.enqueueMemSet(l[0], zero);
        stream.enqueueMemSet(d[0], zero);
        stream.enqueueMemSet(r[0], zero);
        stream.enqueueMemSet(u[0], zero);
        
        stream.enqueueMemSet(l[1], zero);
        stream.enqueueMemSet(d[1], zero);
        stream.enqueueMemSet(r[1], zero);
        stream.enqueueMemSet(u[1], zero);

        stream.enqueueMemSet(data_cost, zero);
        stream.enqueueMemSet(data_cost_selected, zero);
    }
    else
    {
        l[0].setTo(zero);
        d[0].setTo(zero);
        r[0].setTo(zero);
        u[0].setTo(zero);

        l[1].setTo(zero);
        d[1].setTo(zero);
        r[1].setTo(zero);
        u[1].setTo(zero);

        data_cost.setTo(zero);
        data_cost_selected.setTo(zero);
    }

    int cur_idx = 0;

    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            int new_idx = (cur_idx + 1) & 1;

            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

    if (stream)
        stream.enqueueMemSet(out, zero);
    else
        out.setTo(zero);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
        if (stream)
            stream.enqueueConvert(out, disp, disp.type());
        else
            out.convertTo(disp, disp.type());
    }
}
Example #17
template <typename T>
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));

    const Scalar zero = Scalar::all(0);

    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    // compute sizes
    AutoBuffer<int> buf(levels * 3);
    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;

    cols_pyr[0]     = cols;
    rows_pyr[0]     = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i]     = cols_pyr[i-1] / 2;
        rows_pyr[i]     = rows_pyr[i-1] / 2;
        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
    }


    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;


    //allocate buffers
    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
    buffers_count += 2; //  data_cost has twice as many rows as the other buffers, which is why +2, not +1
    buffers_count += 1; //  data_cost_selected
    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);

    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);

    for(int k = 0; k < 2; ++k) // in/out
    {
        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);

        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
        for(int _r = 0; _r < 5; ++_r)
        {
            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
            assert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
        }
    }

    size_t elem_step = mbuf.step / sizeof(T);

    Size temp_size = data_cost.size();
    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    if (stream)
    {
        stream.enqueueMemSet(l[0], zero);
        stream.enqueueMemSet(d[0], zero);
        stream.enqueueMemSet(r[0], zero);
        stream.enqueueMemSet(u[0], zero);

        stream.enqueueMemSet(l[1], zero);
        stream.enqueueMemSet(d[1], zero);
        stream.enqueueMemSet(r[1], zero);
        stream.enqueueMemSet(u[1], zero);

        stream.enqueueMemSet(data_cost, zero);
        stream.enqueueMemSet(data_cost_selected, zero);
    }
    else
    {
        l[0].setTo(zero);
        d[0].setTo(zero);
        r[0].setTo(zero);
        u[0].setTo(zero);

        l[1].setTo(zero);
        d[1].setTo(zero);
        r[1].setTo(zero);
        u[1].setTo(zero);

        data_cost.setTo(zero);
        data_cost_selected.setTo(zero);
    }

    int cur_idx = 0;

    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            int new_idx = (cur_idx + 1) & 1;

            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

    if (stream)
        stream.enqueueMemSet(out, zero);
    else
        out.setTo(zero);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
        if (stream)
            stream.enqueueConvert(out, disp, disp.type());
        else
            out.convertTo(disp, disp.type());
    }
}
Example #18
inline
void Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
    src.setTo(val, *this);
}
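
Example #18 shows that the legacy Stream::enqueueMemSet call is just a shim over the stream-aware GpuMat::setTo. A small sketch, assuming that shim is available in the build (cuda module interface):

#include <opencv2/core/cuda.hpp>

// Both calls below perform the same asynchronous fill on the given stream.
void clearAsync(cv::cuda::GpuMat& img, cv::cuda::Stream& stream)
{
    stream.enqueueMemSet(img, cv::Scalar::all(0)); // legacy spelling, forwards to setTo
    img.setTo(cv::Scalar::all(0), stream);         // direct stream-aware setTo
}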
Example #19
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
    (void)src1;
    (void)src2;
    (void)alpha;
    (void)src3;
    (void)beta;
    (void)dst;
    (void)flags;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else
    // CUBLAS works with column-major matrices

    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));

    if (src1.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    bool tr1 = (flags & GEMM_1_T) != 0;
    bool tr2 = (flags & GEMM_2_T) != 0;
    bool tr3 = (flags & GEMM_3_T) != 0;

    if (src1.type() == CV_64FC2)
    {
        if (tr1 || tr2 || tr3)
            CV_Error(CV_StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
    }

    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
    Size dstSize(src2Size.width, src1Size.height);

    CV_Assert(src1Size.width == src2Size.height);
    CV_Assert(src3.empty() || src3Size == dstSize);

    dst.create(dstSize, src1.type());

    if (beta != 0)
    {
        if (src3.empty())
        {
            if (stream)
                stream.enqueueMemSet(dst, Scalar::all(0));
            else
                dst.setTo(Scalar::all(0));
        }
        else
        {
            if (tr3)
            {
                transpose(src3, dst, stream);
            }
            else
            {
                if (stream)
                    stream.enqueueCopy(src3, dst);
                else
                    src3.copyTo(dst);
            }
        }
    }

    cublasHandle_t handle;
    cublasSafeCall( cublasCreate_v2(&handle) );

    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );

    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );

    const float alphaf = static_cast<float>(alpha);
    const float betaf = static_cast<float>(beta);

    const cuComplex alphacf = make_cuComplex(alphaf, 0);
    const cuComplex betacf = make_cuComplex(betaf, 0);

    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);

    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;

    switch (src1.type())
    {
    case CV_32FC1:
        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphaf,
            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
            &betaf,
            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
        break;

    case CV_64FC1:
        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alpha,
            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
            &beta,
            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
        break;

    case CV_32FC2:
        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphacf,
            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
            &betacf,
            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
        break;

    case CV_64FC2:
        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
            &alphac,
            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
            &betac,
            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
        break;
    }

    cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
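
A hedged usage sketch for the GEMM wrapper above, computing dst = 1.0 * src1^T * src2 + 0.5 * src3 on the device (requires a CUBLAS-enabled build; 2.4-era gpu module assumed):

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

void gemmExample()
{
    cv::Mat a = cv::Mat::ones(128, 64, CV_32FC1); // transposed below: 64 x 128
    cv::Mat b = cv::Mat::ones(128, 32, CV_32FC1);
    cv::Mat c = cv::Mat::ones(64, 32, CV_32FC1);

    cv::gpu::GpuMat d_a(a), d_b(b), d_c(c), d_dst;

    // GEMM_1_T transposes the first operand, so the result is 64 x 32.
    cv::gpu::gemm(d_a, d_b, 1.0, d_c, 0.5, d_dst, cv::GEMM_1_T);

    cv::Mat dst;
    d_dst.download(dst); // every entry equals 128 * 1 + 0.5 * 1 = 128.5
}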
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
    float maxDistance, const vector<GpuMat>& masks, Stream& stream)
{
    if (query.empty() || empty())
        return;

    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }

    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    vector<DevMem2Db> trains_(trainDescCollection.begin(), trainDescCollection.end());
    vector<DevMem2Db> masks_(masks.begin(), masks.end());

    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
}
Example #21
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
    using namespace cv::gpu::device::pyrlk;

    if (prevPts.empty())
    {
        nextPts.release();
        status.release();
        if (err) err->release();
        return;
    }

    dim3 block, patch;
    calcPatchSize(winSize, block, patch, isDeviceArch11_);

    CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
    CV_Assert(winSize.width > 2 && winSize.height > 2);
    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);

    if (useInitialFlow)
        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
    else
        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);

    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
    GpuMat temp2 = nextPts.reshape(1);
    multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);

    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
    status.setTo(Scalar::all(1));

    if (err)
        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);

    // build the image pyramids.

    prevPyr_.resize(maxLevel + 1);
    nextPyr_.resize(maxLevel + 1);

    int cn = prevImg.channels();

    if (cn == 1 || cn == 4)
    {
        prevImg.convertTo(prevPyr_[0], CV_32F);
        nextImg.convertTo(nextPyr_[0], CV_32F);
    }
    else
    {
        cvtColor(prevImg, dx_calcBuf_, COLOR_BGR2BGRA);
        dx_calcBuf_.convertTo(prevPyr_[0], CV_32F);

        cvtColor(nextImg, dx_calcBuf_, COLOR_BGR2BGRA);
        dx_calcBuf_.convertTo(nextPyr_[0], CV_32F);
    }

    for (int level = 1; level <= maxLevel; ++level)
    {
        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
    }

    loadConstants(make_int2(winSize.width, winSize.height), iters);

    for (int level = maxLevel; level >= 0; level--)
    {
        if (cn == 1)
        {
            lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
        else
        {
            lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
    }
}
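
A hedged usage sketch for the sparse tracker above, assuming the 2.4-era interface where winSize, maxLevel and iters are public members and prevPts must be a 1 x N CV_32FC2 row (as the assertions above require):

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <vector>

// d_prev / d_next: CV_8UC1, CV_8UC3 or CV_8UC4 frames already on the GPU.
void trackPoints(const cv::gpu::GpuMat& d_prev, const cv::gpu::GpuMat& d_next,
                 const std::vector<cv::Point2f>& pts)
{
    if (pts.empty()) return;

    // Pack the input points as a single CV_32FC2 row.
    cv::Mat h_pts(1, static_cast<int>(pts.size()), CV_32FC2, (void*)&pts[0]);
    cv::gpu::GpuMat d_prevPts(h_pts), d_nextPts, d_status;

    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21);
    lk.maxLevel = 3;
    lk.iters    = 30;

    lk.sparse(d_prev, d_next, d_prevPts, d_nextPts, d_status);

    cv::Mat nextPts, status;
    d_nextPts.download(nextPts);
    d_status.download(status); // CV_8UC1: nonzero where the point was tracked
}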
Example #22
inline
void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
{
    src.setTo(val, mask, *this);
}
Example #23
void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream)
{
    GpuMat src = _src.getGpuMat();
    Mat M = _M.getMat();

    CV_Assert( M.rows == 3 && M.cols == 3 );

    const int interpolation = flags & INTER_MAX;

    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
    CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );

    _dst.create(dsize, src.type());
    GpuMat dst = _dst.getGpuMat();

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    static const bool useNppTab[6][4][3] =
    {
        {
            {false, false, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        }
    };

    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
    // NPP bug on float data
    useNpp = useNpp && src.depth() != CV_32F;

    if (useNpp)
    {
        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
            {
                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
            },
            {
                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
            }
        };

        dst.setTo(borderValue, stream);

        double coeffs[3][3];
        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());

        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream));
    }
    else
    {
        using namespace cv::cuda::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

        if (flags & WARP_INVERSE_MAP)
            M.convertTo(coeffsMat, coeffsMat.type());
        else
        {
            cv::Mat iM;
            invert(M, iM);
            iM.convertTo(coeffsMat, coeffsMat.type());
        }

        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
}
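
A hedged usage sketch for the warp above (OpenCV 3.x cudawarping module assumed; the homography values are purely illustrative):

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudawarping.hpp>

void warpExample(const cv::cuda::GpuMat& d_src, cv::cuda::GpuMat& d_dst)
{
    // Mild scale/shear plus a small perspective term.
    cv::Mat M = (cv::Mat_<double>(3, 3) << 1.1,  0.05, 10.0,
                                           0.02, 1.0,   5.0,
                                           1e-4, 0.0,   1.0);

    cv::cuda::warpPerspective(d_src, d_dst, M, d_src.size(),
                              cv::INTER_LINEAR, cv::BORDER_CONSTANT,
                              cv::Scalar::all(0));
}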
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, 
    const GpuMat& mask, Stream& stream)
{
    if (query.empty() || train.empty())
        return;

    using namespace ::cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, 
            matchL1_gpu<unsigned short>, matchL1_gpu<short>, 
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, 
            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, 
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };

    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));

    const int nQuery = query.rows;
    const int nTrain = train.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }
    
    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);    

    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
}