Beispiel #1
0
        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }
        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
            const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];

                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }
Beispiel #3
0
        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src.cols && y < src.rows && mask(y, x))
            {
                dst.ptr(y)[x] = op(src.ptr(y)[x]);
            }
        }
Beispiel #4
0
        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
            const Mask mask, const BinOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src1.cols && y < src1.rows && mask(y, x))
            {
                const T1 src1_data = src1.ptr(y)[x];
                const T2 src2_data = src2.ptr(y)[x];
                dst.ptr(y)[x] = op(src1_data, src2_data);
            }
        }
Beispiel #5
0
 template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
 {
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
     cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
 }
Beispiel #6
0
    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
    {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            const int BLOCK_DIM_X = 32;
            const int BLOCK_DIM_Y = 8;
            const int PATCH_PER_BLOCK = 4;
            const int HALO_SIZE = 1;
        #else
            const int BLOCK_DIM_X = 32;
            const int BLOCK_DIM_Y = 4;
            const int PATCH_PER_BLOCK = 4;
            const int HALO_SIZE = 1;
        #endif

        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

        __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];

        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (y >= src.rows)
            return;

        const T* src_row = src.ptr(y);

        const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;

        if (blockIdx.x > 0)
        {
            //Load left halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
        }
        else
        {
            //Load left halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
        }

        if (blockIdx.x + 2 < gridDim.x)
        {
            //Load main data
            #pragma unroll
            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);

            //Load right halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
        }
        else
        {
            //Load main data
            #pragma unroll
            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));

            //Load right halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
        {
            const int x = xStart + j * BLOCK_DIM_X;

            if (x < src.cols)
            {
                sum_t sum = VecTraits<sum_t>::all(0);

                #pragma unroll
                for (int k = 0; k < KSIZE; ++k)
                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];

                dst(y, x) = saturate_cast<D>(sum);
            }
        }
    }
Beispiel #7
0
    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
    {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            const int BLOCK_DIM_X = 16;
            const int BLOCK_DIM_Y = 16;
            const int PATCH_PER_BLOCK = 4;
            const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
        #else
            const int BLOCK_DIM_X = 16;
            const int BLOCK_DIM_Y = 8;
            const int PATCH_PER_BLOCK = 2;
            const int HALO_SIZE = 2;
        #endif

        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

        __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];

        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;

        if (x >= src.cols)
            return;

        const T* src_col = src.ptr() + x;

        const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;

        if (blockIdx.y > 0)
        {
            //Upper halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
        }
        else
        {
            //Upper halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
        }

        if (blockIdx.y + 2 < gridDim.y)
        {
            //Main data
            #pragma unroll
            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
                smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));

            //Lower halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
        }
        else
        {
            //Main data
            #pragma unroll
            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
                smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));

            //Lower halo
            #pragma unroll
            for (int j = 0; j < HALO_SIZE; ++j)
                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
        {
            const int y = yStart + j * BLOCK_DIM_Y;

            if (y < src.rows)
            {
                sum_t sum = VecTraits<sum_t>::all(0);

                #pragma unroll
                for (int k = 0; k < KSIZE; ++k)
                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];

                dst(y, x) = saturate_cast<D>(sum);
            }
        }
    }