Example no. 1
void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint iter)
{
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    const int bCount   = (is_color ? 1 : in.dims[2]);
    const int channels = (is_color ? in.dims[2] : 1); // this has to be 3 for color images

    dim3 blocks(blk_x * bCount, blk_y * in.dims[3]);

    // clamp spatial and chromatic sigmas
    float space_     = std::min(11.5f, s_sigma);
    int radius  = std::max((int)(space_ * 1.5f), 1);
    int padding = 2*radius+1;
    const float cvar = c_sigma*c_sigma;
    size_t shrd_size = channels*(threads.x + padding)*(threads.y+padding)*sizeof(T);

    if (is_color)
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);
    else
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
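
These launchers size their grids with a ceiling-division helper; a minimal host-side sketch, assuming divup follows the usual (a + b - 1) / b definition used in the common headers, and with hypothetical image dimensions:

#include <cstdio>

// Hypothetical stand-alone illustration; the real divup lives in the common headers.
static inline int divup(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int THREADS_X = 16, THREADS_Y = 16;
    const int width = 250, height = 97;          // example in.dims[0], in.dims[1]
    int blk_x = divup(width,  THREADS_X);        // 16 blocks cover 250 columns
    int blk_y = divup(height, THREADS_Y);        // 7 blocks cover 97 rows
    // The launcher multiplies blk_x/blk_y by the batch dims so a single 2D grid
    // covers every image in the batch; the kernel recovers the batch index
    // from blockIdx.x / blk_x (and blockIdx.y / blk_y).
    printf("grid = %d x %d per image\n", blk_x, blk_y);
    return 0;
}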
Example no. 2
void morph(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);
    // launch batch * blk_x blocks along x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int halo      = windLen/2;
    int padding   = 2*halo;
    int shrdLen   = kernel::THREADS_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize  = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);

    switch(windLen) {
        case  3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
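
To make the tile arithmetic above concrete, a small host-side sketch that reproduces the shared-memory sizing for a range of window lengths (a 16x16 block shape and a float element type are assumed here; the real THREADS_X/THREADS_Y constants live in the kernel header):

#include <cstdio>

int main() {
    const int THREADS_X = 16, THREADS_Y = 16;    // assumed block shape
    for (int windLen = 3; windLen <= 19; windLen += 2) {
        int halo     = windLen / 2;              // apron on each side
        int padding  = 2 * halo;                 // apron on both sides
        int shrdLen  = THREADS_X + padding + 1;  // +1 column to avoid bank conflicts
        int shrdSize = shrdLen * (THREADS_Y + padding) * (int)sizeof(float);
        printf("windLen=%2d -> %d bytes of shared memory per block\n",
               windLen, shrdSize);
    }
    return 0;
}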
Example no. 3
void all_distances(Param<To> dist,
                       CParam<T> query,
                       CParam<T> train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const unsigned max_kern_feat_len = min(THREADS, feat_len);
    const To max_dist = maxval<To>();

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine whether the feature chunk is small enough to use shared memory (faster)
    int device          = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem   = prop.sharedMemPerBlock;
    size_t smem_predef  = 2 * THREADS * sizeof(unsigned) + max_kern_feat_len * sizeof(T);
    size_t strain_sz    = threads.x * max_kern_feat_len * sizeof(T);
    bool use_shmem      = avail_smem >= (smem_predef + strain_sz);
    unsigned smem_sz    = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    // Compute the distance between every query and training vector,
    // processing the feature length in chunks of THREADS
    for (unsigned feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,true>), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len, max_kern_feat_len, feat_offset);
        } else {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,false>), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len, max_kern_feat_len, feat_offset);
        }
    }
    POST_LAUNCH_CHECK();
}
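
The shared-memory fallback above hinges on cudaDeviceProp::sharedMemPerBlock; a minimal sketch of that decision taken in isolation, with hypothetical sizes standing in for THREADS, max_kern_feat_len and sizeof(T):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);           // active device assumed to be 0 here

    size_t threads     = 256;                    // stand-in for THREADS
    size_t feat_len    = 128;                    // stand-in for max_kern_feat_len
    size_t elem        = sizeof(float);          // stand-in for sizeof(T)
    size_t smem_predef = 2 * threads * sizeof(unsigned) + feat_len * elem;
    size_t strain_sz   = threads * feat_len * elem;

    bool use_shmem = prop.sharedMemPerBlock >= smem_predef + strain_sz;
    printf("sharedMemPerBlock = %zu bytes -> %s path\n",
           prop.sharedMemPerBlock, use_shmem ? "shared-memory" : "global-memory");
    return 0;
}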
Example no. 4
void susan_responses(T* out, const T* in, const unsigned idim0,
                     const unsigned idim1, const int radius, const float t,
                     const float g, const unsigned edge) {
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X),
                divup(idim1 - edge * 2, BLOCK_Y));
    const size_t SMEM_SIZE =
        (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE, out, in,
                     idim0, idim1, radius, t, g, edge);

    POST_LAUNCH_CHECK();
}
Example no. 5
void morph3d(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::CUBE_X, kernel::CUBE_Y, kernel::CUBE_Z);

    int blk_x = divup(in.dims[0], CUBE_X);
    int blk_y = divup(in.dims[1], CUBE_Y);
    int blk_z = divup(in.dims[2], CUBE_Z);
    dim3 blocks(blk_x * in.dims[3], blk_y, blk_z);

    // calculate shared memory size
    int halo      = windLen/2;
    int padding   = 2*halo;
    int shrdLen   = kernel::CUBE_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize  = shrdLen * (kernel::CUBE_Y + padding) * (kernel::CUBE_Z + padding) * sizeof(T);

    switch(windLen) {
        case  3: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
        case  5: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x); break;
        case  7: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x); break;
        default: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
    }

    POST_LAUNCH_CHECK();
}
Example no. 6
void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
    dim3 threads(kernel::THREADS_X, 1);

    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD*THREADS_X);

    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    // If nbins > MAX_BINS, the kernel falls back to global memory, so smem_size can be 0.
    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
            out, in, nElems, nbins, minval, maxval, blk_x);

    POST_LAUNCH_CHECK();
}
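
The histogram launcher gives each thread THRD_LOAD input elements before sizing the grid, and only uses shared memory when the bins fit. A small sketch of that arithmetic with hypothetical constants (the real THRD_LOAD/MAX_BINS values live in the kernel header):

#include <cstdio>

static inline int divup(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int THREADS_X = 256;                    // assumed launch width
    const int THRD_LOAD = 16;                     // assumed elements per thread
    const int MAX_BINS  = 256;                    // assumed shared-memory bin limit

    int nElems = 1024 * 768;                      // dims[0] * dims[1] of one image
    int blk_x  = divup(nElems, THRD_LOAD * THREADS_X);

    int nbins = 300;
    int smem  = nbins <= MAX_BINS ? nbins * (int)sizeof(unsigned) : 0;

    printf("blk_x = %d, smem = %d bytes (%s histogram)\n",
           blk_x, smem, smem ? "shared-memory" : "global-memory");
    return 0;
}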
Example no. 7
void bilateral(Param<outType> out, CParam<inType> in, float s_sigma, float c_sigma)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int radius = (int)std::max(s_sigma * 1.5f, 1.f);
    int num_shrd_elems    = (THREADS_X + 2 * radius) * (THREADS_Y + 2 * radius);
    int num_gauss_elems   = (2 * radius + 1)*(2 * radius + 1);
    int total_shrd_size   = sizeof(outType) * (num_shrd_elems + num_gauss_elems);

    CUDA_LAUNCH_SMEM((bilateralKernel<inType, outType>), blocks, threads, total_shrd_size,
        out, in, s_sigma, c_sigma, num_shrd_elems, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
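
A small host-side sketch of the bilateral sizing above: each block keeps a padded image tile plus a precomputed spatial Gaussian window in shared memory (a 16x16 block shape is assumed, and float stands in for outType):

#include <algorithm>
#include <cstdio>

int main() {
    const int THREADS_X = 16, THREADS_Y = 16;     // assumed block shape
    float s_sigma = 2.5f;

    int radius          = (int)std::max(s_sigma * 1.5f, 1.f);
    int num_shrd_elems  = (THREADS_X + 2 * radius) * (THREADS_Y + 2 * radius);
    int num_gauss_elems = (2 * radius + 1) * (2 * radius + 1);
    int total_bytes     = (int)sizeof(float) * (num_shrd_elems + num_gauss_elems);

    printf("radius=%d, tile=%d elems, gauss=%d elems, %d bytes per block\n",
           radius, num_shrd_elems, num_gauss_elems, total_bytes);
    return 0;
}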
Example no. 8
void nearest_neighbour(Param<uint> idx,
                       Param<To> dist,
                       CParam<T> query,
                       CParam<T> train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist = maxval<To>();

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned nquery = query.dims[sample_dim];
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine whether feat_len is small enough to use shared memory (faster)
    int device = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem = prop.sharedMemPerBlock;
    size_t smem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz = threads.x * feat_len * sizeof(T);
    bool use_shmem = avail_smem >= (smem_predef + strain_sz);
    unsigned smem_sz = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    unsigned nblk = blocks.x;

    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

    // For each query vector, find training vector with smallest Hamming
    // distance per CUDA block
    if (use_shmem) {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    else {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks = dim3(nquery, 1);

    // Reduce all smallest Hamming distances from each block and store final
    // best match
    CUDA_LAUNCH(select_matches, blocks, threads,
                idx, dist, d_blk_idx.get(), d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();

}
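
The launcher above writes one candidate (index, distance) pair per block and per query, and select_matches then keeps the best candidate for each query. A host-side analogue of that final selection, as a sketch only and assuming a [query][block] layout for the per-block buffers (the real kernel does this cooperatively with a 32x8 block per query):

#include <vector>
#include <cstdio>

int main() {
    const unsigned nquery = 2, nblk = 3;
    // Per-block candidates, laid out here as [query][block] purely for illustration.
    std::vector<unsigned> blk_idx  = { 4, 17, 9,   2, 11, 30 };
    std::vector<float>    blk_dist = { 0.5f, 0.1f, 0.9f,   0.3f, 0.7f, 0.2f };

    for (unsigned q = 0; q < nquery; ++q) {
        unsigned best_i = blk_idx[q * nblk];
        float    best_d = blk_dist[q * nblk];
        for (unsigned b = 1; b < nblk; ++b) {
            if (blk_dist[q * nblk + b] < best_d) {
                best_d = blk_dist[q * nblk + b];
                best_i = blk_idx[q * nblk + b];
            }
        }
        printf("query %u -> train %u (dist %.2f)\n", q, best_i, best_d);
    }
    return 0;
}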
Example no. 9
int computeH(
    Param<T> bestH,
    Param<T> H,
    Param<float> err,
    CParam<float> x_src,
    CParam<float> y_src,
    CParam<float> x_dst,
    CParam<float> y_dst,
    CParam<float> rnd,
    const unsigned iterations,
    const unsigned nsamples,
    const float inlier_thr,
    const af_homography_type htype)
{
    dim3 threads(16, 16);
    dim3 blocks(1, divup(iterations, threads.y));

    // Build linear system and solve SVD
    size_t ls_shared_sz = threads.x * 81 * 2 * sizeof(T);
    CUDA_LAUNCH_SMEM((buildLinearSystem<T>), blocks, threads, ls_shared_sz,
                H, x_src, y_src, x_dst, y_dst, rnd, iterations);
    POST_LAUNCH_CHECK();

    threads = dim3(256);
    blocks = dim3(divup(iterations, threads.x));

    // Allocate some temporary buffers
    Param<unsigned> idx, inliers;
    Param<float> median;
    inliers.dims[0] = (htype == AF_HOMOGRAPHY_RANSAC) ? blocks.x : divup(nsamples, threads.x);
    inliers.strides[0] = 1;
    idx.dims[0] = median.dims[0] = blocks.x;
    idx.strides[0] = median.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        inliers.dims[k] = 1;
        inliers.strides[k] = inliers.dims[k-1] * inliers.strides[k-1];
        idx.dims[k] = median.dims[k] = 1;
        idx.strides[k] = median.strides[k] = idx.dims[k-1] * idx.strides[k-1];
    }
    idx.ptr = memAlloc<unsigned>(idx.dims[3] * idx.strides[3]);
    inliers.ptr = memAlloc<unsigned>(inliers.dims[3] * inliers.strides[3]);
    if (htype == AF_HOMOGRAPHY_LMEDS)
        median.ptr = memAlloc<float>(median.dims[3] * median.strides[3]);

    // Compute (and for RANSAC, evaluate) homographies
    CUDA_LAUNCH((computeEvalHomography<T>), blocks, threads,
                 inliers, idx, H, err, x_src, y_src, x_dst, y_dst,
                 rnd, iterations, nsamples, inlier_thr, htype);
    POST_LAUNCH_CHECK();

    unsigned inliersH, idxH;
    if (htype == AF_HOMOGRAPHY_LMEDS) {
        // TODO: Improve this sorting, if the number of iterations is
        // sufficiently large, this can be *very* slow
        kernel::sort0<float, true>(err);

        unsigned minIdx;
        float minMedian;

        // Compute median of every iteration
        CUDA_LAUNCH((computeMedian), blocks, threads,
                    median, idx, err, iterations);
        POST_LAUNCH_CHECK();

        // Reduce medians; only needed when iterations > 256 (i.e. more than one block)
        if (blocks.x > 1) {
            blocks = dim3(1);

            float* finalMedian = memAlloc<float>(1);
            unsigned* finalIdx = memAlloc<unsigned>(1);

            CUDA_LAUNCH((findMinMedian), blocks, threads,
                        finalMedian, finalIdx, median, idx);
            POST_LAUNCH_CHECK();

            CUDA_CHECK(cudaMemcpyAsync(&minMedian, finalMedian, sizeof(float),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, finalIdx, sizeof(unsigned),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

            memFree(finalMedian);
            memFree(finalIdx);
        } else {
            CUDA_CHECK(cudaMemcpyAsync(&minMedian, median.ptr, sizeof(float),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, idx.ptr, sizeof(unsigned),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        }

        // Copy best homography to output
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + minIdx * 9, 9*sizeof(T),
                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));

        blocks = dim3(divup(nsamples, threads.x));
        // sync stream so the device-to-host copies of minMedian/minIdx have
        // completed before the subsequent kernel launch uses them
        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

        CUDA_LAUNCH((computeLMedSInliers<T>), blocks, threads,
                    inliers, bestH, x_src, y_src, x_dst, y_dst,
                    minMedian, nsamples);
        POST_LAUNCH_CHECK();

        // Add up the total number of inliers
        Param<unsigned> totalInliers;
        for (int k = 0; k < 4; k++)
            totalInliers.dims[k] = totalInliers.strides[k] = 1;
        totalInliers.ptr = memAlloc<unsigned>(1);

        kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);

        CUDA_CHECK(cudaMemcpyAsync(&inliersH, totalInliers.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

        memFree(totalInliers.ptr);
        memFree(median.ptr);
    } else if (htype == AF_HOMOGRAPHY_RANSAC) {
        Param<unsigned> bestInliers, bestIdx;
        for (int k = 0; k < 4; k++) {
            bestInliers.dims[k] = bestIdx.dims[k] = 1;
            bestInliers.strides[k] = bestIdx.strides[k] = 1;
        }
        bestInliers.ptr = memAlloc<unsigned>(1);
        bestIdx.ptr = memAlloc<unsigned>(1);

        kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.ptr, inliers, 0);

        unsigned blockIdx;
        CUDA_CHECK(cudaMemcpyAsync(&blockIdx, bestIdx.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

        // Copy back the index and inlier count of the best homography estimate
        CUDA_CHECK(cudaMemcpyAsync(&idxH, idx.ptr+blockIdx, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(&inliersH, bestInliers.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + idxH * 9, 9*sizeof(T),
                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));

        memFree(bestInliers.ptr);
        memFree(bestIdx.ptr);
    }

    memFree(inliers.ptr);
    memFree(idx.ptr);
    // sync stream so the asynchronous device-to-host copies (inliersH in
    // particular) have completed before the result is returned
    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

    return (int)inliersH;
}
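
The temporary Param buffers above are filled with the dense-stride convention used throughout: strides[0] is 1 and every later stride is the previous stride times the previous dimension. A minimal sketch of that rule with a hypothetical dims array:

#include <cstdio>

int main() {
    int dims[4]    = { 5, 1, 1, 1 };              // e.g. blocks.x candidates, rest 1
    int strides[4] = { 1, 0, 0, 0 };
    for (int k = 1; k < 4; ++k)
        strides[k] = strides[k - 1] * dims[k - 1];

    // Total element count of the dense buffer, as used for memAlloc above.
    int total = strides[3] * dims[3];
    printf("strides = {%d,%d,%d,%d}, total elements = %d\n",
           strides[0], strides[1], strides[2], strides[3], total);
    return 0;
}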