Esempio n. 1
0
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Esempio n. 2
0
void nearest_neighbour(Param<uint> idx,
                       Param<To> dist,
                       CParam<T> query,
                       CParam<T> train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist = maxval<To>();

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned nquery = query.dims[sample_dim];
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine maximum feat_len capable of using shared memory (faster)
    int device = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem = prop.sharedMemPerBlock;
    size_t smem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz = threads.x * feat_len * sizeof(T);
    bool use_shmem = (avail_smem >= (smem_predef + strain_sz)) ? true : false;
    unsigned smem_sz = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    unsigned nblk = blocks.x;

    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

    // For each query vector, find training vector with smallest Hamming
    // distance per CUDA block
    if (use_shmem) {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    else {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks = dim3(nquery, 1);

    // Reduce all smallest Hamming distances from each block and store final
    // best match
    CUDA_LAUNCH(select_matches, blocks, threads,
                idx, dist, d_blk_idx.get(), d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();

}