void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint iter)
{
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    const int bCount   = (is_color ? 1 : in.dims[2]);
    const int channels = (is_color ? in.dims[2] : 1); // this has to be 3 for color images

    dim3 blocks(blk_x * bCount, blk_y * in.dims[3]);

    // clamp spatial and chromatic sigmas
    float space_     = std::min(11.5f, s_sigma);
    int radius       = std::max((int)(space_ * 1.5f), 1);
    int padding      = 2*radius+1;
    const float cvar = c_sigma*c_sigma;

    size_t shrd_size = channels*(threads.x + padding)*(threads.y + padding)*sizeof(T);

    if (is_color)
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);
    else
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
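// For intuition, the shared-memory size above is the thread tile grown by the
// full clamped window on each axis, per channel. A minimal sketch of the same
// arithmetic (meanshiftSmemBytes is illustrative; the 16x16 tile used in the
// example below is an assumption, the real values are kernel::THREADS_X/Y):
static constexpr size_t meanshiftSmemBytes(size_t tx, size_t ty, int radius,
                                           int channels, size_t elemSize) {
    return channels * (tx + 2 * radius + 1) * (ty + 2 * radius + 1) * elemSize;
}
// e.g. s_sigma = 3 -> space_ = 3, radius = 4, so a 3-channel f32 image needs
// meanshiftSmemBytes(16, 16, 4, 3, sizeof(float)) == 3*25*25*4 == 7500 bytes,
// which must fit within cudaDeviceProp::sharedMemPerBlock.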
void morph(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);
    // launch batch * blk_x blocks along x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int halo     = windLen/2;
    int padding  = 2*halo;
    int shrdLen  = kernel::THREADS_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);

    switch(windLen) {
        case  3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
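// The switch above only exists to promote the runtime window length to a
// compile-time template argument, so the kernel's window loops can be fully
// unrolled. A minimal sketch of that dispatch pattern, independent of the
// morphology specifics (applyWindowed/dispatchWindow are illustrative names,
// not part of this codebase):
template<int WindowLen>
static void applyWindowed() {
    // WindowLen is a compile-time constant here, so this loop unrolls
    for (int i = 0; i < WindowLen; ++i) { /* per-element work */ }
}

static void dispatchWindow(int windLen) {
    switch (windLen) {
        case 3:  applyWindowed<3>(); break;
        case 5:  applyWindowed<5>(); break;
        case 7:  applyWindowed<7>(); break;
        default: applyWindowed<3>(); break; // same fallback style as morph() above
    }
}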
void all_distances(Param<To> dist, CParam<T> query, CParam<T> train,
                   const dim_t dist_dim, const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const unsigned max_kern_feat_len = min(THREADS, feat_len);
    const To max_dist = maxval<To>();

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine whether feat_len is small enough to use shared memory (faster)
    int device          = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem   = prop.sharedMemPerBlock;
    size_t smem_predef  = 2 * THREADS * sizeof(unsigned) + max_kern_feat_len * sizeof(T);
    size_t strain_sz    = threads.x * max_kern_feat_len * sizeof(T);
    bool use_shmem      = avail_smem >= (smem_predef + strain_sz);
    unsigned smem_sz    = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    // Compute distances between all query and training vectors, walking the
    // feature dimension in chunks of THREADS per launch
    for (unsigned feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,true >), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len,
                             max_kern_feat_len, feat_offset);
        } else {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,false>), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len,
                             max_kern_feat_len, feat_offset);
        }
    }

    POST_LAUNCH_CHECK();
}
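// The use_shmem decision above compares the predicted footprint against the
// device's per-block shared-memory limit obtained via getDeviceProp(). A
// minimal sketch of the same query using the raw CUDA runtime API
// (availableSmemPerBlock is an illustrative name, not a codebase helper):
static size_t availableSmemPerBlock() {
    int device = 0;
    cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);
    return prop.sharedMemPerBlock; // commonly 48 KiB per block
}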
void susan_responses(T* out, const T* in, const unsigned idim0, const unsigned idim1,
                     const int radius, const float t, const float g, const unsigned edge)
{
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X), divup(idim1 - edge * 2, BLOCK_Y));

    const size_t SMEM_SIZE = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE,
                     out, in, idim0, idim1, radius, t, g, edge);
    POST_LAUNCH_CHECK();
}
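// divup(), used for every grid-size computation in this file, is plain
// ceiling division. It is defined elsewhere in the codebase; an equivalent
// reference version (divupRef is an illustrative name):
static inline unsigned divupRef(unsigned a, unsigned b) {
    return (a + b - 1) / b; // e.g. divupRef(100, 32) == 4 blocks to cover 100 rows
}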
void morph3d(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::CUBE_X, kernel::CUBE_Y, kernel::CUBE_Z);

    int blk_x = divup(in.dims[0], CUBE_X);
    int blk_y = divup(in.dims[1], CUBE_Y);
    int blk_z = divup(in.dims[2], CUBE_Z);

    dim3 blocks(blk_x * in.dims[3], blk_y, blk_z);

    // calculate shared memory size
    int halo     = windLen/2;
    int padding  = 2*halo;
    int shrdLen  = kernel::CUBE_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize = shrdLen * (kernel::CUBE_Y + padding) * (kernel::CUBE_Z + padding) * sizeof(T);

    switch(windLen) {
        case 3:  CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
        case 5:  CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x); break;
        case 7:  CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x); break;
        default: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
    }

    POST_LAUNCH_CHECK();
}
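// blocks.x above is blk_x * in.dims[3]: the batch dimension is packed along
// the grid's x axis. A kernel typically unpacks that with a div/mod pair; a
// minimal sketch of the pattern (unpackBatchDemo is illustrative, and this is
// an assumption about morph3DKernel, whose body is not shown here):
static __global__ void unpackBatchDemo(int blk_x, int* batchOut, int* tileOut) {
    const int batchId = blockIdx.x / blk_x;            // which volume in the batch
    const int localBx = blockIdx.x - batchId * blk_x;  // tile index within that volume
    if (threadIdx.x == 0) {
        batchOut[blockIdx.x] = batchId;
        tileOut[blockIdx.x]  = localBx;
    }
}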
void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
    dim3 threads(kernel::THREADS_X, 1);

    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD*THREADS_X);

    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    // If nbins > MAX_BINS, the kernel falls back to global memory, so smem_size can be 0
    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
                     out, in, nElems, nbins, minval, maxval, blk_x);
    POST_LAUNCH_CHECK();
}
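// When nbins <= MAX_BINS, each block presumably accumulates a private
// histogram in shared memory and flushes it to global memory once, which is
// what makes the shared-memory path worthwhile. A minimal, self-contained
// sketch of that technique (smemHistDemo is illustrative, not histogramKernel
// itself; launch it with nbins * sizeof(unsigned) bytes of dynamic shared memory):
static __global__ void smemHistDemo(unsigned* globalHist, const float* in, int n,
                                    int nbins, float minval, float maxval) {
    extern __shared__ unsigned localHist[];
    for (int b = threadIdx.x; b < nbins; b += blockDim.x) localHist[b] = 0;
    __syncthreads();

    const float scale = nbins / (maxval - minval);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x) {
        int bin = min(nbins - 1, max(0, (int)((in[i] - minval) * scale)));
        atomicAdd(&localHist[bin], 1u);  // cheap shared-memory atomic
    }
    __syncthreads();

    for (int b = threadIdx.x; b < nbins; b += blockDim.x)
        atomicAdd(&globalHist[b], localHist[b]); // one global atomic per bin per block
}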
void bilateral(Param<outType> out, CParam<inType> in, float s_sigma, float c_sigma)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int radius          = (int)std::max(s_sigma * 1.5f, 1.f);
    int num_shrd_elems  = (THREADS_X + 2 * radius) * (THREADS_Y + 2 * radius);
    int num_gauss_elems = (2 * radius + 1) * (2 * radius + 1);
    int total_shrd_size = sizeof(outType) * (num_shrd_elems + num_gauss_elems);

    CUDA_LAUNCH_SMEM((bilateralKernel<inType, outType>), blocks, threads, total_shrd_size,
                     out, in, s_sigma, c_sigma, num_shrd_elems, blk_x, blk_y);
    POST_LAUNCH_CHECK();
}
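// The second shared-memory region above ((2*radius+1)^2 elements) presumably
// holds the precomputed spatial Gaussian weights, leaving only the range term
// to evaluate per pixel. A host-side sketch of such a table (gaussianTable is
// an illustrative name; requires <cmath> and <vector>):
#include <cmath>
#include <vector>
static std::vector<float> gaussianTable(int radius, float s_sigma) {
    const int len = 2 * radius + 1;
    std::vector<float> w(len * len);
    const float var2 = 2.0f * s_sigma * s_sigma;
    for (int y = -radius; y <= radius; ++y)
        for (int x = -radius; x <= radius; ++x)
            w[(y + radius) * len + (x + radius)] =
                std::exp(-(x * x + y * y) / var2); // unnormalized spatial weight
    return w;
}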
void nearest_neighbour(Param<uint> idx, Param<To> dist, CParam<T> query, CParam<T> train,
                       const dim_t dist_dim, const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist = maxval<To>();

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned nquery = query.dims[sample_dim];
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine whether feat_len is small enough to use shared memory (faster)
    int device          = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem   = prop.sharedMemPerBlock;
    size_t smem_predef  = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz    = threads.x * feat_len * sizeof(T);
    bool use_shmem      = avail_smem >= (smem_predef + strain_sz);
    unsigned smem_sz    = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    unsigned nblk = blocks.x;

    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

    // For each query vector, find the training vector with the smallest
    // distance per CUDA block
    if (use_shmem) {
        switch(feat_len) {
            // Optimized lengths (faster due to loop unrolling)
            case 1:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 1,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 2:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 2,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 4:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 4,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 8:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 8,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 16: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 32: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 64: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            default: CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    } else {
        switch(feat_len) {
            // Optimized lengths (faster due to loop unrolling)
            case 1:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 1,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 2:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 2,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 4:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 4,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 8:  CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type, 8,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 16: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 32: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            case 64: CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist); break;
            default: CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz, d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks  = dim3(nquery, 1);

    // Reduce the per-block smallest distances and store the final best match
    CUDA_LAUNCH(select_matches, blocks, threads,
                idx, dist, d_blk_idx.get(), d_blk_dist.get(),
                nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();
}
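// The first pass above leaves one best candidate per (block, query) pair in
// d_blk_idx/d_blk_dist; select_matches then reduces the nblk candidates per
// query down to one. A host-side sketch of that reduction's logic
// (selectMatchesRef is illustrative, and the block-major layout shown is an
// assumption about the buffers' format, not taken from select_matches):
static void selectMatchesRef(unsigned* idx, float* dist,
                             const unsigned* blkIdx, const float* blkDist,
                             unsigned nquery, unsigned nblk, float maxDist) {
    for (unsigned q = 0; q < nquery; ++q) {
        float    best    = maxDist;
        unsigned bestIdx = 0;
        for (unsigned b = 0; b < nblk; ++b) {
            const float d = blkDist[b * nquery + q]; // candidate from block b
            if (d < best) { best = d; bestIdx = blkIdx[b * nquery + q]; }
        }
        idx[q]  = bestIdx;
        dist[q] = best;
    }
}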
int computeH(Param<T> bestH, Param<T> H, Param<float> err,
             CParam<float> x_src, CParam<float> y_src,
             CParam<float> x_dst, CParam<float> y_dst,
             CParam<float> rnd, const unsigned iterations,
             const unsigned nsamples, const float inlier_thr,
             const af_homography_type htype)
{
    dim3 threads(16, 16);
    dim3 blocks(1, divup(iterations, threads.y));

    // Build linear system and solve SVD
    size_t ls_shared_sz = threads.x * 81 * 2 * sizeof(T);
    CUDA_LAUNCH_SMEM((buildLinearSystem<T>), blocks, threads, ls_shared_sz,
                     H, x_src, y_src, x_dst, y_dst, rnd, iterations);
    POST_LAUNCH_CHECK();

    threads = dim3(256);
    blocks  = dim3(divup(iterations, threads.x));

    // Allocate some temporary buffers
    Param<unsigned> idx, inliers;
    Param<float> median;
    inliers.dims[0]    = (htype == AF_HOMOGRAPHY_RANSAC) ? blocks.x : divup(nsamples, threads.x);
    inliers.strides[0] = 1;
    idx.dims[0]        = median.dims[0] = blocks.x;
    idx.strides[0]     = median.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        inliers.dims[k]    = 1;
        inliers.strides[k] = inliers.dims[k-1] * inliers.strides[k-1];
        idx.dims[k]        = median.dims[k] = 1;
        idx.strides[k]     = median.strides[k] = idx.dims[k-1] * idx.strides[k-1];
    }
    idx.ptr     = memAlloc<unsigned>(idx.dims[3] * idx.strides[3]);
    inliers.ptr = memAlloc<unsigned>(inliers.dims[3] * inliers.strides[3]);
    if (htype == AF_HOMOGRAPHY_LMEDS)
        median.ptr = memAlloc<float>(median.dims[3] * median.strides[3]);

    // Compute (and, for RANSAC, evaluate) homographies
    CUDA_LAUNCH((computeEvalHomography<T>), blocks, threads,
                inliers, idx, H, err, x_src, y_src, x_dst, y_dst,
                rnd, iterations, nsamples, inlier_thr, htype);
    POST_LAUNCH_CHECK();

    unsigned inliersH = 0, idxH = 0;
    if (htype == AF_HOMOGRAPHY_LMEDS) {
        // TODO: Improve this sorting. If the number of iterations is
        // sufficiently large, this can be *very* slow
        kernel::sort0<float, true>(err);

        unsigned minIdx;
        float minMedian;

        // Compute the median of every iteration
        CUDA_LAUNCH((computeMedian), blocks, threads,
                    median, idx, err, iterations);
        POST_LAUNCH_CHECK();

        // Reduce the medians, only needed when iterations > 256
        if (blocks.x > 1) {
            blocks = dim3(1);
            float* finalMedian = memAlloc<float>(1);
            unsigned* finalIdx = memAlloc<unsigned>(1);
            CUDA_LAUNCH((findMinMedian), blocks, threads,
                        finalMedian, finalIdx, median, idx);
            POST_LAUNCH_CHECK();
            CUDA_CHECK(cudaMemcpyAsync(&minMedian, finalMedian, sizeof(float),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, finalIdx, sizeof(unsigned),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
            memFree(finalMedian);
            memFree(finalIdx);
        } else {
            CUDA_CHECK(cudaMemcpyAsync(&minMedian, median.ptr, sizeof(float),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, idx.ptr, sizeof(unsigned),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
        }

        // Copy the best homography to the output
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + minIdx * 9, 9*sizeof(T),
                                   cudaMemcpyDeviceToDevice,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        blocks = dim3(divup(nsamples, threads.x));
        // Sync the stream so the device-to-host copies are visible to
        // the subsequent kernel launch
        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_LAUNCH((computeLMedSInliers<T>), blocks, threads,
                    inliers, bestH, x_src, y_src, x_dst, y_dst,
                    minMedian, nsamples);
        POST_LAUNCH_CHECK();

        // Add up the total number of inliers
        Param<unsigned> totalInliers;
        for (int k = 0; k < 4; k++)
            totalInliers.dims[k] = totalInliers.strides[k] = 1;
        totalInliers.ptr = memAlloc<unsigned>(1);

        kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);

        CUDA_CHECK(cudaMemcpyAsync(&inliersH, totalInliers.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        memFree(totalInliers.ptr);
        memFree(median.ptr);
    } else if (htype == AF_HOMOGRAPHY_RANSAC) {
        Param<unsigned> bestInliers, bestIdx;
        for (int k = 0; k < 4; k++) {
            bestInliers.dims[k]    = bestIdx.dims[k]    = 1;
            bestInliers.strides[k] = bestIdx.strides[k] = 1;
        }
        bestInliers.ptr = memAlloc<unsigned>(1);
        bestIdx.ptr     = memAlloc<unsigned>(1);

        kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.ptr, inliers, 0);

        unsigned blockIdx;
        CUDA_CHECK(cudaMemcpyAsync(&blockIdx, bestIdx.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        // Copy back the index and inlier count of the best homography estimate
        CUDA_CHECK(cudaMemcpyAsync(&idxH, idx.ptr+blockIdx, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(&inliersH, bestInliers.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + idxH * 9, 9*sizeof(T),
                                   cudaMemcpyDeviceToDevice,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        memFree(bestInliers.ptr);
        memFree(bestIdx.ptr);
    }

    memFree(inliers.ptr);
    memFree(idx.ptr);

    // Sync the stream so the device-to-host copies are visible to the caller
    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

    return (int)inliersH;
}
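// The stride setup in computeH() is column-major linearization: strides[0] = 1
// and strides[k] = dims[k-1] * strides[k-1], so dims[3] * strides[3] is the
// total element count handed to memAlloc. A minimal reference version
// (fillStrides is an illustrative name):
static void fillStrides(const int dims[4], int strides[4]) {
    strides[0] = 1;
    for (int k = 1; k < 4; ++k)
        strides[k] = dims[k - 1] * strides[k - 1];
    // e.g. dims = {8,1,1,1} -> strides = {1,8,8,8}; total = dims[3]*strides[3] = 8
}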