Beispiel #1
0
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();

    if( rtype < 0 )
        rtype = src.type();
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels());

    int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
    if( sdepth == ddepth && noScale )
    {
        src.copyTo(dst);
        return;
    }

    GpuMat temp;
    const GpuMat* psrc = &src;
    if( sdepth != ddepth && psrc == &dst )
        psrc = &(temp = src);

    dst.create( src.size(), rtype );
    matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), alpha, beta, impl->stream);
}
    unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
    {
        CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);

        const int defaultObjSearchNum = 100;
        if (objectsBuf.empty())
        {
            objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
        }

        cv::Size ncvMinSize = this->getClassifierCvSize();

        if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
        {
            ncvMinSize.width = minSize.width;
            ncvMinSize.height = minSize.height;
        }

        unsigned int numDetections;
        ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));

        return numDetections;
    }
Beispiel #3
0
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{
    CV_Assert(src.type() == CV_8UC1);

    dst.create(src.size(), src.type());

    int intBufSize;
    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );

    ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf);

    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);

    calcHist(src, hist, s);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );

    hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
}
Beispiel #4
0
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
{
    CV_Assert(src.depth() == CV_8U && src.channels() < 4);

    int border_size = search_window/2 + block_window/2;
    Size esize = src.size() + Size(border_size, border_size) * 2;

    cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
    GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);

    cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
    GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));

    int bcols, brows;
    cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
    buffer.create(brows, bcols, CV_32S);

    using namespace cv::gpu::cudev::imgproc;
    typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
    static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};

    dst.create(src.size(), src.type());
    funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
}
// Core of the constant-space belief propagation stereo matcher.
//
// Allocates and clears the message buffers (u/d/l/r, two copies each for
// ping-ponging between pyramid levels), the selected-disparity buffers and the
// data-cost buffers, then walks the pyramid from the coarsest level down to
// level 0 and finally writes the disparity map.
//
// rthis   - algorithm parameters; note that rthis.levels is overwritten below.
// u,d,l,r - message buffers, two per direction.
// disp_selected_pyr, data_cost, data_cost_selected, temp, out - work buffers,
//           (re)created here.
// left, right - input pair; must be CV_8UC1 or CV_8UC3 (size/type equality of
//           the two images is only checked by a debug assertion).
// disp    - result; created as CV_16S if empty. If it already has another type,
//           the CV_16S result is computed into `out` and converted.
// stream  - CUDA stream handed to the kernels that accept one.
//
// NOTE(review): the element type T is not declared in the visible lines;
// presumably a template parameter list precedes this definition - confirm.
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                          GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));

    const Scalar zero = Scalar::all(0);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    // Limit the pyramid depth to log2(ndisp), truncated to an integer.
    // This writes the clamped value back into the caller's object.
    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    // One allocation split into four per-level arrays of `levels` ints each.
    AutoBuffer<int> buf(levels * 4);

    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;
    int* step_pyr = nr_plane_pyr + levels;

    cols_pyr[0] = cols;
    rows_pyr[0] = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    // Row step per level, expressed in elements of T; the byte width is passed
    // through alignSize with n = 64 (presumably rounding up to a multiple of 64).
    const int n = 64;
    step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
    for (int i = 1; i < levels; i++)
    {
        // Each coarser level halves both dimensions (rounding up) ...
        cols_pyr[i] = (cols_pyr[i-1] + 1) / 2;
        rows_pyr[i] = (rows_pyr[i-1] + 1) / 2;

        // ... and doubles the number of disparity planes kept per pixel.
        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;

        step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
    }

    // All buffers are sized for level 0; planes are stacked vertically.
    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);

    u[0].create(msg_size, DataType<T>::type);
    d[0].create(msg_size, DataType<T>::type);
    l[0].create(msg_size, DataType<T>::type);
    r[0].create(msg_size, DataType<T>::type);

    u[1].create(msg_size, DataType<T>::type);
    d[1].create(msg_size, DataType<T>::type);
    l[1].create(msg_size, DataType<T>::type);
    r[1].create(msg_size, DataType<T>::type);

    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
    disp_selected_pyr[1].create(msg_size, DataType<T>::type);

    data_cost.create(data_cost_size, DataType<T>::type);
    data_cost_selected.create(msg_size, DataType<T>::type);

    // Replace the computed level-0 step with the step of the actual allocation.
    step_pyr[0] = data_cost.step / sizeof(T);

    // temp must hold whichever is larger: the data-cost area, or the coarsest
    // level with all ndisp disparities.
    Size temp_size = data_cost_size;
    if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(step_pyr[levels - 1], rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
        rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    // Clear every message and cost buffer before the first pass.
    l[0] = zero;
    d[0] = zero;
    r[0] = zero;
    u[0] = zero;

    l[1] = zero;
    d[1] = zero;
    r[1] = zero;
    u[1] = zero;

    data_cost = zero;
    data_cost_selected = zero;

    // Index (0 or 1) of the message/disparity buffer set currently in use.
    int cur_idx = 0;

    // Coarse-to-fine: start at the top of the pyramid and finish at level 0.
    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            // Coarsest level: initialise the data cost directly.
            csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, stream);
        }
        else
        {
            // Finer level: derive the data cost from level i+1, then initialise
            // the other buffer set from the current one and switch to it.
            csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);

            int new_idx = (cur_idx + 1) & 1;

            csbp::init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                               u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                               disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                               data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
                               cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);

            cur_idx = new_idx;
        }

        // Run rthis.iters message-passing iterations at this level.
        csbp::calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                                  data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
                                  rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, stream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    // The disparity kernel writes CV_16S. Write straight into disp when it has
    // that type; otherwise write into a freshly created `out` and convert below.
    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
    out = zero;

    csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                       data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], stream);

    if (disp.type() != CV_16S)
        out.convertTo(disp, disp.type());
}
Beispiel #6
0
// Generalized matrix multiplication on the GPU via cuBLAS:
//     dst = alpha * op(src1) * op(src2) + beta * op(src3)
//
// src1, src2 - factors; CV_32FC1, CV_32FC2, CV_64FC1 or CV_64FC2, same type.
// alpha      - weight of the product.
// src3       - optional addend (may be empty); same type as src1 when present.
// beta       - weight of src3.
// dst        - result, created with the product size and the type of src1.
// flags      - combination of GEMM_1_T / GEMM_2_T / GEMM_3_T selecting which
//              operands are transposed (not available for CV_64FC2).
// stream     - stream used for the preparation of dst and for the cuBLAS call.
//
// Raises an error when built without cuBLAS, when doubles are requested on a
// device without native double support, or for transposed CV_64FC2 input.
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
    (void)src1;
    (void)src2;
    (void)alpha;
    (void)src3;
    (void)beta;
    (void)dst;
    (void)flags;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else
    // CUBLAS works with column-major matrices

    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));

    if (src1.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    bool tr1 = (flags & GEMM_1_T) != 0;
    bool tr2 = (flags & GEMM_2_T) != 0;
    bool tr3 = (flags & GEMM_3_T) != 0;

    if (src1.type() == CV_64FC2)
    {
        if (tr1 || tr2 || tr3)
            CV_Error(CV_StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
    }

    // Effective operand sizes after the requested transpositions.
    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
    Size dstSize(src2Size.width, src1Size.height);

    CV_Assert(src1Size.width == src2Size.height);
    CV_Assert(src3.empty() || src3Size == dstSize);

    dst.create(dstSize, src1.type());

    // cuBLAS accumulates into C, so when beta is non-zero dst must be preloaded
    // with op(src3) (or zeros if there is no src3).
    if (beta != 0)
    {
        if (src3.empty())
        {
            if (stream)
                stream.enqueueMemSet(dst, Scalar::all(0));
            else
                dst.setTo(Scalar::all(0));
        }
        else
        {
            if (tr3)
            {
                transpose(src3, dst, stream);
            }
            else
            {
                if (stream)
                    stream.enqueueCopy(src3, dst);
                else
                    src3.copyTo(dst);
            }
        }
    }

    // Scope guard for the cuBLAS handle: if any of the calls below reports its
    // failure by throwing, the handle is still destroyed instead of leaking.
    // On the regular path the guard is disarmed and the handle is destroyed
    // explicitly so that a failing destroy is still reported.
    struct CublasHandleGuard
    {
        cublasHandle_t handle;
        bool armed;

        CublasHandleGuard() : handle(0), armed(false) {}
        ~CublasHandleGuard()
        {
            if (armed)
                cublasDestroy_v2(handle);
        }
    };

    CublasHandleGuard guard;
    cublasSafeCall( cublasCreate_v2(&guard.handle) );
    guard.armed = true;

    const cublasHandle_t handle = guard.handle;

    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );

    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );

    const float alphaf = static_cast<float>(alpha);
    const float betaf = static_cast<float>(beta);

    const cuComplex alphacf = make_cuComplex(alphaf, 0);
    const cuComplex betacf = make_cuComplex(betaf, 0);

    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);

    // Row-major A*B is evaluated as column-major B^T * A^T, hence src2 is passed
    // as the first cuBLAS operand and src1 as the second.
    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;

    // GEMM dimensions, identical for every element type.
    const int m = tr2 ? src2.rows : src2.cols;
    const int n = tr1 ? src1.cols : src1.rows;
    const int k = tr2 ? src2.cols : src2.rows;

    switch (src1.type())
    {
    case CV_32FC1:
        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, m, n, k,
            &alphaf,
            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
            &betaf,
            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
        break;

    case CV_64FC1:
        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, m, n, k,
            &alpha,
            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
            &beta,
            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
        break;

    case CV_32FC2:
        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, m, n, k,
            &alphacf,
            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
            &betacf,
            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
        break;

    case CV_64FC2:
        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, m, n, k,
            &alphac,
            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
            &betac,
            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
        break;
    }

    guard.armed = false;
    cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
Beispiel #7
0
// Applies a 256-entry lookup table to an 8-bit image using NPP.
//
// src - input image, CV_8UC1 or CV_8UC3.
// lut - continuous 8-bit host table with exactly 256 entries; either one
//       channel (shared by all image channels) or as many channels as src.
// dst - output image, created with the size of src.
// s   - stream on which the NPP call is issued; with the default (null) stream
//       the function synchronizes the device before returning.
//
// For CUDA versions newer than 4.2 the tables are uploaded to the device before
// being handed to NPP; for older versions host pointers are passed.
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
    // Identity "levels" table (0..255), built once and shared by all calls.
    class LevelsInit
    {
    public:
        Npp32s pLevels[256];
        const Npp32s* pLevels3[3];
        int nValues3[3];

#if (CUDA_VERSION > 4020)
        GpuMat d_pLevels;
#endif

        LevelsInit()
        {
            nValues3[0] = nValues3[1] = nValues3[2] = 256;
            for (int i = 0; i < 256; ++i)
                pLevels[i] = i;


#if (CUDA_VERSION <= 4020)
            pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
            d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
            pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
        }
    };
    static LevelsInit lvls;

    int cn = src.channels();

    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
    CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous());

    dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));

    NppiSize sz;
    sz.height = src.rows;
    sz.width = src.cols;

    // NPP expects 32-bit table values.
    Mat nppLut;
    lut.convertTo(nppLut, CV_32S);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    if (src.type() == CV_8UC1)
    {
#if (CUDA_VERSION <= 4020)
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
#else
        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
#endif
    }
    else
    {
        const Npp32s* pValues3[3];

        Mat nppLut3[3];

#if (CUDA_VERSION > 4020)
        // Device copies of the value tables. They are declared in this scope,
        // not inside the branches below, so that the buffers pValues3 points to
        // are still alive when nppiLUT_Linear_8u_C3R is called.
        GpuMat d_nppLut3[3];
#endif

        if (nppLut.channels() == 1)
        {
            // One table shared by all three channels.
#if (CUDA_VERSION <= 4020)
            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
            d_nppLut3[0].upload(Mat(1, 256, CV_32S, nppLut.data));
            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut3[0].ptr<Npp32s>();
#endif
        }
        else
        {
            // Separate table per channel.
            cv::split(nppLut, nppLut3);

            for (int c = 0; c < 3; ++c)
            {
#if (CUDA_VERSION <= 4020)
                pValues3[c] = nppLut3[c].ptr<Npp32s>();
#else
                d_nppLut3[c].upload(Mat(1, 256, CV_32S, nppLut3[c].data));
                pValues3[c] = d_nppLut3[c].ptr<Npp32s>();
#endif
            }
        }

        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
Beispiel #8
0
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
{
    typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);

#ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
    };
#else
    static const func_t funcs[6][4] =
    {
        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
        {0 /*device::resize<schar>*/, 0 /*device::resize<char2>*/  , 0 /*device::resize<char3>*/, 0 /*device::resize<char4>*/},
        {device::resize<ushort>     , 0 /*device::resize<ushort2>*/, device::resize<ushort3>    , device::resize<ushort4>    },
        {device::resize<short>      , 0 /*device::resize<short2>*/ , device::resize<short3>     , device::resize<short4>     },
        {0 /*device::resize<int>*/  , 0 /*device::resize<int2>*/   , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
    };
#endif

    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
    CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );

    if (dsize == Size())
    {
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    }
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }

    dst.create(dsize, src.type());

    if (dsize == src.size())
    {
        if (stream)
            stream.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    const func_t func = funcs[src.depth()][src.channels() - 1];

    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
    PtrStepSzb wholeSrc(wholeSize.height, wholeSize.width, src.datastart, src.step);

    func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx), interpolation, StreamAccessor::getStream(stream));
}
Beispiel #9
0
void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);

    dst.create(src.rows + top + bottom, src.cols + left + right, src.type());

    cudaStream_t stream = StreamAccessor::getStream(s);

    if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
    {
        NppiSize srcsz;
        srcsz.width  = src.cols;
        srcsz.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        switch (src.type())
        {
        case CV_8UC1:
            {
                Npp8u nVal = saturate_cast<Npp8u>(value[0]);
                nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_8UC4:
            {
                Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
                nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32SC1:
            {
                Npp32s nVal = saturate_cast<Npp32s>(value[0]);
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32FC1:
            {
                Npp32f val = saturate_cast<Npp32f>(value[0]);
                Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  ,    copyMakeBorder_caller<uchar, 2>   ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
            {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
            {   copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/,    copyMakeBorder_caller<ushort, 3> ,    copyMakeBorder_caller<ushort, 4>},
            {   copyMakeBorder_caller<short, 1>  , 0/*copyMakeBorder_caller<short, 2>*/ ,    copyMakeBorder_caller<short, 3>  ,    copyMakeBorder_caller<short, 4>},
            {0/*copyMakeBorder_caller<int,   1>*/, 0/*copyMakeBorder_caller<int,   2>*/ , 0/*copyMakeBorder_caller<int,   3>*/, 0/*copyMakeBorder_caller<int  , 4>*/},
            {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
        };

        caller_t func = callers[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, dst, top, left, borderType, value, stream);
    }
}
Beispiel #10
0
// Reduces a matrix to a single row of values.
//
// src      - input matrix; depth up to CV_32F, at most 4 channels.
// dst      - output, created as one row with src.cols (dim == 0) or src.rows
//            (dim == 1) elements and the channel count of src.
// dim      - 0: collapse the rows; 1: collapse the columns.
// reduceOp - CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX or CV_REDUCE_MIN.
// dtype    - output depth (at most CV_32F); a negative value keeps the depth of src.
// stream   - stream on which the reduction kernel is issued.
//
// Raises CV_StsUnsupportedFormat when no kernel exists for the requested
// source/destination depth pair.
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
    using namespace ::cv::gpu::device::matrix_reductions;

    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
    CV_Assert(dim == 0 || dim == 1);
    CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);

    if (dtype < 0)
        dtype = src.depth();

    const int outLength = (dim == 0) ? src.cols : src.rows;
    dst.create(1, outLength, CV_MAKETYPE(dtype, src.channels()));

    // Both dispatch tables are indexed by [source depth][destination depth];
    // 0 marks a pair for which no kernel is instantiated.
    if (dim == 0)
    {
        typedef void (*row_reducer_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const row_reducer_t rowReducers[6][6] =
        {
            // CV_8U source
            { reduceRows_gpu<unsigned char, int, unsigned char>, 0, 0, 0, reduceRows_gpu<unsigned char, int, int>, reduceRows_gpu<unsigned char, int, float> },
            // CV_8S source
            { 0, 0, 0, 0, 0, 0 },
            // CV_16U source
            { 0, 0, reduceRows_gpu<unsigned short, int, unsigned short>, 0, reduceRows_gpu<unsigned short, int, int>, reduceRows_gpu<unsigned short, int, float> },
            // CV_16S source
            { 0, 0, 0, reduceRows_gpu<short, int, short>, reduceRows_gpu<short, int, int>, reduceRows_gpu<short, int, float> },
            // CV_32S source
            { 0, 0, 0, 0, reduceRows_gpu<int, int, int>, reduceRows_gpu<int, int, float> },
            // CV_32F source
            { 0, 0, 0, 0, 0, reduceRows_gpu<float, float, float> }
        };

        const row_reducer_t reducer = rowReducers[src.depth()][dst.depth()];

        if (!reducer)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        // Channels are folded into the column axis for the row reduction.
        reducer(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
    }
    else
    {
        typedef void (*col_reducer_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const col_reducer_t colReducers[6][6] =
        {
            // CV_8U source
            { reduceCols_gpu<unsigned char, int, unsigned char>, 0, 0, 0, reduceCols_gpu<unsigned char, int, int>, reduceCols_gpu<unsigned char, int, float> },
            // CV_8S source
            { 0, 0, 0, 0, 0, 0 },
            // CV_16U source
            { 0, 0, reduceCols_gpu<unsigned short, int, unsigned short>, 0, reduceCols_gpu<unsigned short, int, int>, reduceCols_gpu<unsigned short, int, float> },
            // CV_16S source
            { 0, 0, 0, reduceCols_gpu<short, int, short>, reduceCols_gpu<short, int, int>, reduceCols_gpu<short, int, float> },
            // CV_32S source
            { 0, 0, 0, 0, reduceCols_gpu<int, int, int>, reduceCols_gpu<int, int, float> },
            // CV_32F source
            { 0, 0, 0, 0, 0, reduceCols_gpu<float, float, float> }
        };

        const col_reducer_t reducer = colReducers[src.depth()][dst.depth()];

        if (!reducer)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        reducer(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
    }
}
Beispiel #11
0
void cv::gpu::FarnebackOpticalFlow::operator ()(
        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
    CV_Assert(frame0.size() == frame1.size());
    CV_Assert(polyN == 5 || polyN == 7);
    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);

    Stream streams[5];
    if (S(s))
        streams[0] = s;

    Size size = frame0.size();
    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;

    flowx.create(size, CV_32F);
    flowy.create(size, CV_32F);
    GpuMat flowx0 = flowx;
    GpuMat flowy0 = flowy;

    // Crop unnecessary levels
    double scale = 1;
    int numLevelsCropped = 0;
    for (; numLevelsCropped < numLevels; numLevelsCropped++)
    {
        scale *= pyrScale;
        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
            break;
    }

    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);

    if (fastPyramids)
    {
        // Build Gaussian pyramids using pyrDown()
        pyramid0_.resize(numLevelsCropped + 1);
        pyramid1_.resize(numLevelsCropped + 1);
        pyramid0_[0] = frames_[0];
        pyramid1_[0] = frames_[1];
        for (int i = 1; i <= numLevelsCropped; ++i)
        {
            pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
            pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
        }
    }

    setPolynomialExpansionConsts(polyN, polySigma);
    device::optflow_farneback::setUpdateMatricesConsts();

    for (int k = numLevelsCropped; k >= 0; k--)
    {
        streams[0].waitForCompletion();

        scale = 1;
        for (int i = 0; i < k; i++)
            scale *= pyrScale;

        double sigma = (1./scale - 1) * 0.5;
        int smoothSize = cvRound(sigma*5) | 1;
        smoothSize = std::max(smoothSize, 3);

        int width = cvRound(size.width*scale);
        int height = cvRound(size.height*scale);

        if (fastPyramids)
        {
            width = pyramid0_[k].cols;
            height = pyramid0_[k].rows;
        }

        if (k > 0)
        {
            curFlowX.create(height, width, CV_32F);
            curFlowY.create(height, width, CV_32F);
        }
        else
        {
            curFlowX = flowx0;
            curFlowY = flowy0;
        }

        if (!prevFlowX.data)
        {
            if (flags & OPTFLOW_USE_INITIAL_FLOW)
            {
#if ENABLE_GPU_RESIZE
                resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
#else
                Mat tmp1, tmp2;
                flowx0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowX.upload(tmp2);
                flowy0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowY.upload(tmp2);
#endif
            }
            else
            {
                streams[0].enqueueMemSet(curFlowX, 0);
                streams[1].enqueueMemSet(curFlowY, 0);
            }
        }
        else
        {
#if ENABLE_GPU_RESIZE
            resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
#else
            Mat tmp1, tmp2;
            prevFlowX.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowX.upload(tmp2);
            prevFlowY.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowY.upload(tmp2);
#endif
        }

        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
        GpuMat R[2] =
        {
            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
            allocMatFromBuf(5*height, width, CV_32F, R_[1])
        };

        if (fastPyramids)
        {
            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
        }
        else
        {
            GpuMat blurredFrame[2] =
            {
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
            };
            GpuMat pyrLevel[2] =
            {
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
            };

            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);

            for (int i = 0; i < 2; i++)
            {
                device::optflow_farneback::gaussianBlurGpu(
                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i]));
#if ENABLE_GPU_RESIZE
                resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR, streams[i]);
#else
                Mat tmp1, tmp2;
                tmp[i].download(tmp1);
                resize(tmp1, tmp2, Size(width, height), INTER_LINEAR);
                I[i].upload(tmp2);
#endif
                device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
            }
        }

        streams[1].waitForCompletion();
        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));

        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
        {
            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
        }
        for (int i = 0; i < numIters; i++)
        {
            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
            else
                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
        }

        prevFlowX = curFlowX;
        prevFlowY = curFlowY;
    }

    flowx = curFlowX;
    flowy = curFlowY;

    if (!S(s))
        streams[0].waitForCompletion();
}
Beispiel #12
0
// Graph cut over nine per-pixel planes (terminals plus eight directional edge
// planes), delegated to NPP's nppiGraphcut8 routines.
//
// terminals      CV_32S plane; CV_32F is additionally accepted when built with
//                CUDA_VERSION >= 5000. Its size and type are the reference for
//                all the other inputs.
// leftTransp,
// rightTransp    same type as terminals, with width and height swapped.
// top ... bottomRight
//                same size and type as terminals.
// labels         output, (re)created here as CV_8U with the size of terminals.
// buf            scratch buffer; grown to the size reported by
//                nppiGraphcut8GetSize.
// s              stream the NPP call is issued on; when it maps to the default
//                (0) stream the whole device is synchronized before returning.
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
    CV_Assert(terminals.type() == CV_32S);
#else
    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif

    Size src_size = terminals.size();

    // The two "Transp" planes must have swapped dimensions.
    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(leftTransp.type() == terminals.type());

    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(rightTransp.type() == terminals.type());

    // Every remaining plane must match the terminals plane exactly.
    CV_Assert(top.size() == src_size);
    CV_Assert(top.type() == terminals.type());

    CV_Assert(topLeft.size() == src_size);
    CV_Assert(topLeft.type() == terminals.type());

    CV_Assert(topRight.size() == src_size);
    CV_Assert(topRight.type() == terminals.type());

    CV_Assert(bottom.size() == src_size);
    CV_Assert(bottom.type() == terminals.type());

    CV_Assert(bottomLeft.size() == src_size);
    CV_Assert(bottomLeft.type() == terminals.type());

    CV_Assert(bottomRight.size() == src_size);
    CV_Assert(bottomRight.type() == terminals.type());

    labels.create(src_size, CV_8U);

    NppiSize roi;
    roi.width = src_size.width;
    roi.height = src_size.height;

    // Ask NPP how much scratch memory it needs and make sure buf can hold it.
    int scratchBytes;
    nppSafeCall( nppiGraphcut8GetSize(roi, &scratchBytes) );
    ensureSizeIsEnough(1, scratchBytes, CV_8U, buf);

    cudaStream_t cudaStream = StreamAccessor::getStream(s);
    NppStreamHandler streamGuard(cudaStream);
    NppiGraphcutStateHandler graphState(roi, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);

    // One step is passed for the regular planes and one for the transposed ones.
    const int planeStep  = static_cast<int>(terminals.step);
    const int transpStep = static_cast<int>(leftTransp.step);
    const int labelStep  = static_cast<int>(labels.step);
    Npp8u* const labelData = labels.ptr<Npp8u>();

    // The float variant only exists in the CUDA >= 5.0 build; everything else
    // (the only possibility being CV_32S after the assertion above) goes
    // through the integer variant.
#if (CUDA_VERSION >= 5000)
    if (terminals.type() == CV_32F)
    {
        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
            planeStep, transpStep, roi, labelData, labelStep, graphState) );
    }
    else
#endif
    {
        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
            planeStep, transpStep, roi, labelData, labelStep, graphState) );
    }

    if (cudaStream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
Beispiel #13
0
void cv::cuda::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
{
    CV_Assert( prev.type() == CV_8UC1 );
    CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );

    const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width,
                       (prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);

    velx.create(velSize, CV_32FC1);
    vely.create(velSize, CV_32FC1);

    // scanning scheme coordinates
    std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1));
    int ssCount = 0;

    // Calculate scanning scheme
    const int minCount = std::min(maxRange.width, maxRange.height);

    // use spiral search pattern
    //
    //     9 10 11 12
    //     8  1  2 13
    //     7  *  3 14
    //     6  5  4 15
    //... 20 19 18 17
    //

    for (int i = 0; i < minCount; ++i)
    {
        // four cycles along sides
        int x = -i - 1, y = x;

        // upper side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) ++x;
            ss[ssCount].y = (short) y;
        }

        // right side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) x;
            ss[ssCount].y = (short) ++y;
        }

        // bottom side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) --x;
            ss[ssCount].y = (short) y;
        }

        // left side
        for (int j = -i; j <= i + 1; ++j, ++ssCount)
        {
            ss[ssCount].x = (short) x;
            ss[ssCount].y = (short) --y;
        }
    }

    // the rest part
    if (maxRange.width < maxRange.height)
    {
        const int xleft = -minCount;

        // cycle by neighbor rings
        for (int i = minCount; i < maxRange.height; ++i)
        {
            // two cycles by x
            int y = -(i + 1);
            int x = xleft;

            // upper side
            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }

            x = xleft;
            y = -y;

            // bottom side
            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }
        }
    }
    else if (maxRange.width > maxRange.height)
    {
        const int yupper = -minCount;

        // cycle by neighbor rings
        for (int i = minCount; i < maxRange.width; ++i)
        {
            // two cycles by y
            int x = -(i + 1);
            int y = yupper;

            // left side
            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }

            y = yupper;
            x = -x;

            // right side
            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
            {
                ss[ssCount].x = (short) x;
                ss[ssCount].y = (short) y;
            }
        }
    }

    const cudaStream_t stream = StreamAccessor::getStream(st);

    ensureSizeIsEnough(1, ssCount, CV_16SC2, buf);
    if (stream == 0)
        cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
    else
        cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) );

    const int maxX = prev.cols - blockSize.width;
    const int maxY = prev.rows - blockSize.height;

    const int SMALL_DIFF = 2;
    const int BIG_DIFF = 128;

    const int blSize = blockSize.area();
    const int acceptLevel = blSize * SMALL_DIFF;
    const int escapeLevel = blSize * BIG_DIFF;

    optflowbm::calc(prev, curr, velx, vely,
                    make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
                    maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
}
Beispiel #14
0
// Applies a perspective transformation to a GPU image.
//
// src          input image; depth up to CV_32F and at most 4 channels.
// dst          output image; created as dsize / src.type() on the non-NPP
//              path, otherwise left to the NPP wrapper.
// M            3x3 transformation matrix.
// dsize        size of the output image.
// flags        interpolation method in the bits covered by INTER_MAX
//              (INTER_NEAREST, INTER_LINEAR or INTER_CUBIC), optionally
//              combined with WARP_INVERSE_MAP. Without WARP_INVERSE_MAP the
//              non-NPP path inverts M before use; with it M is used as given.
// borderMode   one of BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT,
//              BORDER_REFLECT, BORDER_WRAP.
// borderValue  border colour, forwarded (as floats) to the non-NPP kernels.
// s            stream the work is issued on.
//
// NPP is used only for BORDER_CONSTANT and for the depth / channel /
// interpolation combinations enabled in useNppTab; everything else is handled
// by the module's own warpPerspective_gpu kernels.
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
    CV_Assert(M.rows == 3 && M.cols == 3);

    int interpolation = flags & INTER_MAX;

    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

    // src may be a ROI of a larger matrix; the kernels need the parent's
    // extent and the ROI offset inside it.
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    // Indexed as [depth][channels - 1][interpolation].
    static const bool useNppTab[6][4][3] =
    {
        {
            {false, false, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, false}
        },
        {
            {false, false, false},
            {false, false, false},
            {false, false, false},
            {false, false, false}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        },
        {
            {false, true, true},
            {false, false, false},
            {false, true, true},
            {false, false, true}
        }
    };

    bool useNpp = borderMode == BORDER_CONSTANT;
    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
    // The bare `linux` macro is not defined by GCC in strict ISO modes
    // (-std=c++NN), so also test the reserved-namespace spellings; otherwise
    // the workaround below would silently be compiled out.
    #if defined(__linux__) || defined(__linux) || defined(linux)
        // NPP bug on float data
        useNpp = useNpp && src.depth() != CV_32F;
    #endif

    if (useNpp)
    {
        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);

        // Indexed as [inverse map?][depth][channels - 1]; the second half uses
        // the "Back" NPP variants for WARP_INVERSE_MAP.
        static const func_t funcs[2][6][4] =
        {
            {
                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
            },
            {
                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
                {0, 0, 0, 0},
                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
            }
        };

        // Convert M to double precision in a plain 3x3 array; coeffsMat is
        // only a header over that array.
        double coeffs[3][3];
        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());

        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, int cc);

        // Indexed as [depth][channels - 1]; null entries are unsupported types.
        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

        dst.create(dsize, src.type());

        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

        // The kernels receive M as-is when WARP_INVERSE_MAP is set, and the
        // inverse of M otherwise.
        if (flags & WARP_INVERSE_MAP)
            M.convertTo(coeffsMat, coeffsMat.type());
        else
        {
            cv::Mat iM;
            invert(M, iM);
            iM.convertTo(coeffsMat, coeffsMat.type());
        }

        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

        // Compute capability encoded as major * 10 + minor, passed to the kernel launcher.
        DeviceInfo info;
        int cc = info.majorVersion() * 10 + info.minorVersion();

        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
    }
}
Beispiel #15
0
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
            || interpolation == INTER_CUBIC || interpolation == INTER_AREA);
    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));

    if (dsize == Size())
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }
    if (dsize != dst.size())
        dst.create(dsize, src.type());

    if (dsize == src.size())
    {
        if (s)
            s.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    cudaStream_t stream = StreamAccessor::getStream(s);

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (src.type() == CV_8UC4 && interpolation != INTER_AREA));

    if (useNpp)
    {
        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
                                    double xFactor, double yFactor, int eInterpolation);

        const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };

        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};

        NppiSize srcsz;
        srcsz.width  = wholeSize.width;
        srcsz.height = wholeSize.height;

        NppiRect srcrect;
        srcrect.x = ofs.x;
        srcrect.y = ofs.y;
        srcrect.width  = src.cols;
        srcrect.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        using namespace ::cv::gpu::device::imgproc;

        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);

        static const func_t funcs[6][4] =
        {
            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
        };

        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
    }
}
Beispiel #16
0
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
    CV_Assert( src.channels() <= 4 );
    CV_Assert( dim == 0 || dim == 1 );
    CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );

    if (dtype < 0)
        dtype = src.depth();

    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));

    if (dim == 0)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::rows<unsigned char, int, unsigned char>,
                0/*::reduce::rows<unsigned char, int, signed char>*/,
                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
                0/*::reduce::rows<unsigned char, int, short>*/,
                ::reduce::rows<unsigned char, int, int>,
                ::reduce::rows<unsigned char, float, float>,
                ::reduce::rows<unsigned char, double, double>
            },
            {
                0/*::reduce::rows<signed char, int, unsigned char>*/,
                0/*::reduce::rows<signed char, int, signed char>*/,
                0/*::reduce::rows<signed char, int, unsigned short>*/,
                0/*::reduce::rows<signed char, int, short>*/,
                0/*::reduce::rows<signed char, int, int>*/,
                0/*::reduce::rows<signed char, float, float>*/,
                0/*::reduce::rows<signed char, double, double>*/
            },
            {
                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
                0/*::reduce::rows<unsigned short, int, signed char>*/,
                ::reduce::rows<unsigned short, int, unsigned short>,
                0/*::reduce::rows<unsigned short, int, short>*/,
                ::reduce::rows<unsigned short, int, int>,
                ::reduce::rows<unsigned short, float, float>,
                ::reduce::rows<unsigned short, double, double>
            },
            {
                0/*::reduce::rows<short, int, unsigned char>*/,
                0/*::reduce::rows<short, int, signed char>*/,
                0/*::reduce::rows<short, int, unsigned short>*/,
                ::reduce::rows<short, int, short>,
                ::reduce::rows<short, int, int>,
                ::reduce::rows<short, float, float>,
                ::reduce::rows<short, double, double>
            },
            {
                0/*::reduce::rows<int, int, unsigned char>*/,
                0/*::reduce::rows<int, int, signed char>*/,
                0/*::reduce::rows<int, int, unsigned short>*/,
                0/*::reduce::rows<int, int, short>*/,
                ::reduce::rows<int, int, int>,
                ::reduce::rows<int, float, float>,
                ::reduce::rows<int, double, double>
            },
            {
                0/*::reduce::rows<float, float, unsigned char>*/,
                0/*::reduce::rows<float, float, signed char>*/,
                0/*::reduce::rows<float, float, unsigned short>*/,
                0/*::reduce::rows<float, float, short>*/,
                0/*::reduce::rows<float, float, int>*/,
                ::reduce::rows<float, float, float>,
                ::reduce::rows<float, double, double>
            },
            {
                0/*::reduce::rows<double, double, unsigned char>*/,
                0/*::reduce::rows<double, double, signed char>*/,
                0/*::reduce::rows<double, double, unsigned short>*/,
                0/*::reduce::rows<double, double, short>*/,
                0/*::reduce::rows<double, double, int>*/,
                0/*::reduce::rows<double, double, float>*/,
                ::reduce::rows<double, double, double>
            }
        };

        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
    }
    else
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::cols<unsigned char, int, unsigned char>,
                0/*::reduce::cols<unsigned char, int, signed char>*/,
                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
                0/*::reduce::cols<unsigned char, int, short>*/,
                ::reduce::cols<unsigned char, int, int>,
                ::reduce::cols<unsigned char, float, float>,
                ::reduce::cols<unsigned char, double, double>
            },
            {
                0/*::reduce::cols<signed char, int, unsigned char>*/,
                0/*::reduce::cols<signed char, int, signed char>*/,
                0/*::reduce::cols<signed char, int, unsigned short>*/,
                0/*::reduce::cols<signed char, int, short>*/,
                0/*::reduce::cols<signed char, int, int>*/,
                0/*::reduce::cols<signed char, float, float>*/,
                0/*::reduce::cols<signed char, double, double>*/
            },
            {
                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
                0/*::reduce::cols<unsigned short, int, signed char>*/,
                ::reduce::cols<unsigned short, int, unsigned short>,
                0/*::reduce::cols<unsigned short, int, short>*/,
                ::reduce::cols<unsigned short, int, int>,
                ::reduce::cols<unsigned short, float, float>,
                ::reduce::cols<unsigned short, double, double>
            },
            {
                0/*::reduce::cols<short, int, unsigned char>*/,
                0/*::reduce::cols<short, int, signed char>*/,
                0/*::reduce::cols<short, int, unsigned short>*/,
                ::reduce::cols<short, int, short>,
                ::reduce::cols<short, int, int>,
                ::reduce::cols<short, float, float>,
                ::reduce::cols<short, double, double>
            },
            {
                0/*::reduce::cols<int, int, unsigned char>*/,
                0/*::reduce::cols<int, int, signed char>*/,
                0/*::reduce::cols<int, int, unsigned short>*/,
                0/*::reduce::cols<int, int, short>*/,
                ::reduce::cols<int, int, int>,
                ::reduce::cols<int, float, float>,
                ::reduce::cols<int, double, double>
            },
            {
                0/*::reduce::cols<float, float, unsigned char>*/,
                0/*::reduce::cols<float, float, signed char>*/,
                0/*::reduce::cols<float, float, unsigned short>*/,
                0/*::reduce::cols<float, float, short>*/,
                0/*::reduce::cols<float, float, int>*/,
                ::reduce::cols<float, float, float>,
                ::reduce::cols<float, double, double>
            },
            {
                0/*::reduce::cols<double, double, unsigned char>*/,
                0/*::reduce::cols<double, double, signed char>*/,
                0/*::reduce::cols<double, double, unsigned short>*/,
                0/*::reduce::cols<double, double, short>*/,
                0/*::reduce::cols<double, double, int>*/,
                0/*::reduce::cols<double, double, float>*/,
                ::reduce::cols<double, double, double>
            }
        };

        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
    }
}
Beispiel #17
0
// Runs constant-space belief propagation stereo matching on a rectified pair and
// writes the resulting disparity map to `disp`.
//
// NOTE(review): the element type `T` used below is not declared inside this excerpt;
// presumably it is a template parameter declared on a line preceding this function -- confirm.
//
// Parameters:
//   rthis  - algorithm settings (ndisp, iters, levels, nr_plane, cost terms, ...).
//            rthis.levels is overwritten with the clamped level count actually used.
//   mbuf   - work buffer; (re)created here and sliced into all message/cost planes.
//   temp   - scratch buffer; (re)created here and handed to load_constants().
//   out    - CV_16S disparity target; aliases `disp` when disp is already CV_16S,
//            otherwise it is (re)created here and converted into `disp` at the end.
//   left, right - input images; must be CV_8UC1, CV_8UC3 or CV_8UC4.
//   disp   - result; created as CV_16S when empty, otherwise its existing type is kept.
//   stream - when it evaluates to true, memsets and the final conversion are enqueued
//            on it; otherwise the blocking GpuMat calls are used.
//
// Raises (via CV_Assert): more than 8 levels requested, unsupported image type, or a
// clamped level count that is not positive.
static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));

    const Scalar zero = Scalar::all(0);

    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    ////////////////////////////////////////////////////////////////////////////////////////////
    // Init

    int rows = left.rows;
    int cols = left.cols;

    // Never use more levels than log2(ndisp) allows.
    const int levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));

    // The clamp yields 0 for ndisp == 1, and the parameter checks above are debug-only.
    // Everything below indexes the pyramid arrays at [0] and [levels - 1], so a
    // non-positive level count must be rejected before any buffer is touched.
    CV_Assert(levels > 0);

    rthis.levels = levels;

    // compute sizes
    // One int array split into three per-level tables: width, height and plane count.
    AutoBuffer<int> buf(levels * 3);
    int* cols_pyr = buf;
    int* rows_pyr = cols_pyr + levels;
    int* nr_plane_pyr = rows_pyr + levels;

    cols_pyr[0]     = cols;
    rows_pyr[0]     = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    // Each coarser level halves both image dimensions and doubles the plane count.
    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i]     = cols_pyr[i-1] / 2;
        rows_pyr[i]     = rows_pyr[i-1] / 2;
        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
    }


    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;


    //allocate buffers
    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
    buffers_count += 2; //  data_cost has twice as many rows as the other buffers, hence +2, not +1
    buffers_count += 1; //  data_cost_selected
    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);

    // Layout of mbuf by rows: [data_cost (2 units)][data_cost_selected (1 unit)][10 message units],
    // where one unit is rows * nr_plane rows.
    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);

    for(int k = 0; k < 2; ++k) // in/out
    {
        // sub1: everything after the two cost buffers; sub2: the half belonging to set k.
        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);

        // Each half is cut into five equally sized planes.
        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
        for(int _r = 0; _r < 5; ++_r)
        {
            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
            assert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
        }
    }

    // Row pitch of mbuf expressed in elements of T; passed to every kernel wrapper below.
    size_t elem_step = mbuf.step / sizeof(T);

    // temp is as large as data_cost unless the coarsest level needs more room for
    // ndisp planes, in which case it is sized for that instead.
    Size temp_size = data_cost.size();
    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);

    ////////////////////////////////////////////////////////////////////////////
    // Compute

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    // Zero both message sets and both cost buffers (disp_selected_pyr is not cleared here).
    if (stream)
    {
        stream.enqueueMemSet(l[0], zero);
        stream.enqueueMemSet(d[0], zero);
        stream.enqueueMemSet(r[0], zero);
        stream.enqueueMemSet(u[0], zero);

        stream.enqueueMemSet(l[1], zero);
        stream.enqueueMemSet(d[1], zero);
        stream.enqueueMemSet(r[1], zero);
        stream.enqueueMemSet(u[1], zero);

        stream.enqueueMemSet(data_cost, zero);
        stream.enqueueMemSet(data_cost_selected, zero);
    }
    else
    {
        l[0].setTo(zero);
        d[0].setTo(zero);
        r[0].setTo(zero);
        u[0].setTo(zero);

        l[1].setTo(zero);
        d[1].setTo(zero);
        r[1].setTo(zero);
        u[1].setTo(zero);

        data_cost.setTo(zero);
        data_cost_selected.setTo(zero);
    }

    // Index (0 or 1) of the message set that currently holds valid data.
    int cur_idx = 0;

    // Walk the pyramid from the coarsest level (levels - 1) down to full resolution (0).
    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            // Coarsest level: initialise the selected data cost directly.
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            // Finer level: compute the data cost using the previous (coarser) level's sizes.
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            // Messages are written into the other set, which then becomes current.
            int new_idx = (cur_idx + 1) & 1;

            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    // Compute straight into disp when it is already CV_16S; otherwise go through `out`.
    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

    if (stream)
        stream.enqueueMemSet(out, zero);
    else
        out.setTo(zero);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);

    // The caller supplied a non-CV_16S disparity map: convert the result into its type.
    if (disp.type() != CV_16S)
    {
        if (stream)
            stream.enqueueConvert(out, disp, disp.type());
        else
            out.convertTo(disp, disp.type());
    }
}
Beispiel #18
0
// Estimates optical flow between two frames with a coarse-to-fine pyramid.
//
// I0, I1       - input frames; same size and type, CV_8UC1 or CV_32FC1.
// flowx, flowy - result components. When useInitialFlow is set they must already be
//                CV_32FC1 and of the frame size and are used as the starting estimate;
//                otherwise they are (re)created here.
//
// Side effect: nscales is lowered permanently if a pyramid level would have fewer
// than 16 rows or columns.
void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
{
    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
    CV_Assert( I0.size() == I1.size() );
    CV_Assert( I0.type() == I1.type() );
    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
    CV_Assert( nscales > 0 );

    const Size frameSize = I0.size();

    // one pyramid slot per scale for both images and both flow components
    I0s.resize(nscales);
    I1s.resize(nscales);
    u1s.resize(nscales);
    u2s.resize(nscales);

    // level 0 holds the float version of the inputs; non-8-bit input is multiplied by 255
    const double gain0 = (I0.depth() == CV_8U) ? 1.0 : 255.0;
    const double gain1 = (I1.depth() == CV_8U) ? 1.0 : 255.0;
    I0.convertTo(I0s[0], CV_32F, gain0);
    I1.convertTo(I1s[0], CV_32F, gain1);

    if (!useInitialFlow)
    {
        flowx.create(frameSize, CV_32FC1);
        flowy.create(frameSize, CV_32FC1);
    }

    // the finest flow level is the caller's output
    u1s[0] = flowx;
    u2s[0] = flowy;

    // full-resolution single-channel float work buffers, allocated in declaration order
    GpuMat* const workBufs[] =
    {
        &I1x_buf, &I1y_buf,
        &I1w_buf, &I1wx_buf, &I1wy_buf,
        &grad_buf, &rho_c_buf,
        &p11_buf, &p12_buf, &p21_buf, &p22_buf,
        &diff_buf
    };
    for (size_t b = 0; b < sizeof(workBufs) / sizeof(workBufs[0]); ++b)
        workBufs[b]->create(frameSize, CV_32FC1);

    // build the coarser pyramid levels
    for (int level = 1; level < nscales; ++level)
    {
        GpuMat& img0 = I0s[level];
        GpuMat& img1 = I1s[level];

        gpu::resize(I0s[level-1], img0, Size(), scaleStep, scaleStep);
        gpu::resize(I1s[level-1], img1, Size(), scaleStep, scaleStep);

        // stop as soon as a level gets too small; the level count shrinks accordingly
        if (img0.cols < 16 || img0.rows < 16)
        {
            nscales = level;
            break;
        }

        GpuMat& flowU = u1s[level];
        GpuMat& flowV = u2s[level];

        if (!useInitialFlow)
        {
            flowU.create(img0.size(), CV_32FC1);
            flowV.create(img0.size(), CV_32FC1);
        }
        else
        {
            // shrink the initial flow and rescale its magnitude to this level
            gpu::resize(u1s[level-1], flowU, Size(), scaleStep, scaleStep);
            gpu::resize(u2s[level-1], flowV, Size(), scaleStep, scaleStep);

            gpu::multiply(flowU, Scalar::all(scaleStep), flowU);
            gpu::multiply(flowV, Scalar::all(scaleStep), flowV);
        }
    }

    // without an initial estimate the coarsest level starts from zero flow
    if (!useInitialFlow)
    {
        u1s[nscales-1].setTo(Scalar::all(0));
        u2s[nscales-1].setTo(Scalar::all(0));
    }

    // coarse-to-fine: solve each level above the finest, then propagate to the next finer one
    for (int level = nscales - 1; level > 0; --level)
    {
        procOneScale(I0s[level], I1s[level], u1s[level], u2s[level]);

        const int finer = level - 1;

        // upsample the flow to the finer level's size
        gpu::resize(u1s[level], u1s[finer], I0s[finer].size());
        gpu::resize(u2s[level], u2s[finer], I0s[finer].size());

        // and scale its magnitude by the matching zoom factor
        gpu::multiply(u1s[finer], Scalar::all(1/scaleStep), u1s[finer]);
        gpu::multiply(u2s[finer], Scalar::all(1/scaleStep), u2s[finer]);
    }

    // the finest level is solved last and needs no further upsampling
    procOneScale(I0s[0], I1s[0], u1s[0], u2s[0]);
}