Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    // The structuring element is staged through the fixed-size constant-memory
    // buffer kernel::cFilter, so it must be square and at most 19x19.
    const dim4 maskDims = mask.dims();
    if (maskDims[0] != maskDims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    if (maskDims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    const dim_t windowLen  = maskDims[0];
    const size_t maskBytes = windowLen * windowLen * sizeof(T);

    // Copy the (device-resident) mask into constant memory on the active
    // stream; the morph kernel reads it from kernel::cFilter.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(), maskBytes,
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getActiveStream()));

    Array<T> out = createEmptyArray<T>(in.dims());

    // isDilation is a compile-time flag (template parameter of the enclosing
    // template), so both specializations are dispatched explicitly.
    if (isDilation) {
        kernel::morph<T, true>(out, in, windowLen);
    } else {
        kernel::morph<T, false>(out, in, windowLen);
    }
    return out;
}
// Finds, for each query vector, the nearest training vector under dist_type.
//
// Phase 1: one CUDA block per chunk of training vectors computes that block's
//          best (index, distance) pair per query into d_blk_idx / d_blk_dist.
// Phase 2: select_matches reduces the per-block candidates to the final match
//          written to idx / dist.
//
// dist_dim selects which dimension holds the feature length; the other
// dimension counts samples. Requires feat_len <= THREADS.
// NOTE(review): n_dist is unused in this body — presumably consumed by a
// k-nearest variant; kept for interface compatibility.
void nearest_neighbour(Param<uint> idx, Param<To> dist, CParam<T> query,
                       CParam<T> train, const dim_t dist_dim,
                       const unsigned n_dist) {
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist       = maxval<To>();

    if (feat_len > THREADS) { CUDA_NOT_SUPPORTED(); }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;
    const unsigned nquery  = query.dims[sample_dim];
    const unsigned ntrain  = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine maximum feat_len capable of using shared memory (faster).
    // smem_predef covers the kernel's fixed shared buffers; strain_sz is the
    // optional tile of training vectors cached per block.
    // NOTE(review): smem_predef sizes two THREADS-long buffers with
    // sizeof(unsigned) — verify against the kernel if To can be wider.
    int device           = getActiveDeviceId();
    cudaDeviceProp prop  = getDeviceProp(device);
    size_t avail_smem    = prop.sharedMemPerBlock;
    size_t smem_predef   = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz     = threads.x * feat_len * sizeof(T);
    bool use_shmem       = avail_smem >= (smem_predef + strain_sz);
    unsigned smem_sz     = use_shmem ? smem_predef + strain_sz : smem_predef;
    unsigned nblk        = blocks.x;

    // Per-block candidate buffers: nblk candidates for each of nquery queries.
    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

// Power-of-two feature lengths dispatch to unrolled specializations (faster
// due to loop unrolling); everything else uses the generic kernel. The two
// macros exist solely to avoid duplicating this switch for each USE_SHMEM
// value — expansion is byte-equivalent to the hand-written switches.
#define NN_LAUNCH_UNROLL(LEN, USE_SHMEM)                                       \
    CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T, To, dist_type, LEN,          \
                      USE_SHMEM>),                                             \
                     blocks, threads, smem_sz, d_blk_idx.get(),                \
                     d_blk_dist.get(), query, train, max_dist)
#define NN_LAUNCH_ALL(USE_SHMEM)                                               \
    switch (feat_len) {                                                        \
        case 1:  NN_LAUNCH_UNROLL(1,  USE_SHMEM); break;                       \
        case 2:  NN_LAUNCH_UNROLL(2,  USE_SHMEM); break;                       \
        case 4:  NN_LAUNCH_UNROLL(4,  USE_SHMEM); break;                       \
        case 8:  NN_LAUNCH_UNROLL(8,  USE_SHMEM); break;                       \
        case 16: NN_LAUNCH_UNROLL(16, USE_SHMEM); break;                       \
        case 32: NN_LAUNCH_UNROLL(32, USE_SHMEM); break;                       \
        case 64: NN_LAUNCH_UNROLL(64, USE_SHMEM); break;                       \
        default:                                                               \
            CUDA_LAUNCH_SMEM((nearest_neighbour<T, To, dist_type,             \
                              USE_SHMEM>),                                     \
                             blocks, threads, smem_sz, d_blk_idx.get(),        \
                             d_blk_dist.get(), query, train, max_dist,         \
                             feat_len);                                        \
    }

    // For each query vector, find training vector with smallest Hamming
    // distance per CUDA block
    if (use_shmem) {
        NN_LAUNCH_ALL(true)
    } else {
        NN_LAUNCH_ALL(false)
    }

#undef NN_LAUNCH_ALL
#undef NN_LAUNCH_UNROLL

    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks  = dim3(nquery, 1);

    // Reduce all smallest Hamming distances from each block and store final
    // best match
    CUDA_LAUNCH(select_matches, blocks, threads, idx, dist, d_blk_idx.get(),
                d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();
}