void transform(Param<T> out, CParam<T> in, CParam<float> tf, const bool inverse) {
    int nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const int ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr,
                                       ntransforms * 6 * sizeof(float), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    const int blocksXPerImage = blocks.x;
    if (nimages > TI) {
        int tile_images = divup(nimages, TI);
        nimages = TI;
        blocks.x = blocks.x * tile_images;
    }

    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if (inverse) {
        CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    } else {
        CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    }
    POST_LAUNCH_CHECK();
}
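// The launcher above relies on a __constant__ transform buffer (c_tmat) and an
// integer ceiling-division helper (divup). A minimal sketch of what those
// definitions might look like; the capacity constant and exact types here are
// assumptions, not the actual declarations:
#include <cuda_runtime.h>

static const int MAX_TRANSFORMS = 256;          // assumed upper bound on batched transforms
__constant__ float c_tmat[MAX_TRANSFORMS * 6];  // six floats per 2x3 affine matrix

// Smallest block count that covers n elements with blocks of the given size.
static inline unsigned divup(unsigned n, unsigned block_size) {
    return (n + block_size - 1) / block_size;
}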
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims) {
    if (nDims == 1) {
        const dim3 threads(THREADS, 1);

        // Find the first dimension with a non-unit number of elements.
        int vDim = 0;
        for (int i = 0; i < 4; i++) {
            if (in.dims[i] == 1)
                vDim++;
            else
                break;
        }

        int blks = divup(out.dims[vDim], THREADS * THRD_LOAD);
        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        dim3 blocks(blks_x * out.dims[2], blks_y * out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in,
                    indices, blks_x, blks_y);
    }
    POST_LAUNCH_CHECK();
}
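// A minimal sketch of the 1D gather lookup1D is assumed to perform, using
// THRD_LOAD from the launcher above, with each thread handling THRD_LOAD
// consecutive elements (strides and the vDim offset are omitted for brevity):
template<typename in_t, typename idx_t>
__global__ void lookup1D_sketch(in_t *out, const in_t *in, const idx_t *indices,
                                int nIndices) {
    int start = (blockIdx.x * blockDim.x + threadIdx.x) * THRD_LOAD;
    int end   = min(start + THRD_LOAD, nIndices);
    for (int i = start; i < end; ++i)
        out[i] = in[indices[i]];  // gather through the index array
}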
static void scan_dim_launcher(Param<To> out, Param<To> tmp, CParam<Ti> in,
                              const uint threads_y, const uint blocks_all[4]) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2], blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>),
                        blocks, threads, out, tmp, in,
                        blocks_all[0], blocks_all[1], blocks_all[dim], lim);
            break;
        case 4:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>),
                        blocks, threads, out, tmp, in,
                        blocks_all[0], blocks_all[1], blocks_all[dim], lim);
            break;
        case 2:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>),
                        blocks, threads, out, tmp, in,
                        blocks_all[0], blocks_all[1], blocks_all[dim], lim);
            break;
        case 1:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>),
                        blocks, threads, out, tmp, in,
                        blocks_all[0], blocks_all[1], blocks_all[dim], lim);
            break;
    }
    POST_LAUNCH_CHECK();
}
static void scan_dim_nonfinal_launcher(Param<To> out, Param<To> tmp,
                                       Param<char> tflg, Param<int> tlid,
                                       CParam<Ti> in, CParam<Tk> key,
                                       const int dim, const uint threads_y,
                                       const uint blocks_all[4],
                                       bool inclusive_scan) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2], blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 8>),
                        blocks, threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 4:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 4>),
                        blocks, threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 2:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 2>),
                        blocks, threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 1:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 1>),
                        blocks, threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
    }
    POST_LAUNCH_CHECK();
}
void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b, int ndims) {
    bool is_same = true;
    for (int i = 0; i < 4; i++) { is_same &= (a.dims[i] == b.dims[i]); }

    dim3 threads(DIMX, DIMY);
    if (ndims == 1) {
        threads.x *= threads.y;
        threads.y = 1;
    }

    int blk_x = divup(out.dims[0], threads.x);
    int blk_y = divup(out.dims[1], threads.y);

    dim3 blocks(blk_x * out.dims[2], blk_y * out.dims[3]);

    if (is_same) {
        CUDA_LAUNCH((select_kernel<T, true>), blocks, threads,
                    out, cond, a, b, blk_x, blk_y);
    } else {
        CUDA_LAUNCH((select_kernel<T, false>), blocks, threads,
                    out, cond, a, b, blk_x, blk_y);
    }
    POST_LAUNCH_CHECK();
}
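// A minimal sketch of the elementwise selection the kernel performs in the
// same-dims case, with the batched block decoding and strides omitted:
template<typename T>
__global__ void select_sketch(T *out, const char *cond, const T *a, const T *b,
                              size_t n) {
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = cond[i] ? a[i] : b[i];  // pick per-element from a or b
}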
static void scan_first_launcher(Param<To> out, Param<To> tmp, CParam<Ti> in,
                                const uint blocks_x, const uint blocks_y,
                                const uint threads_x) {
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    uint lim = divup(out.dims[0], (threads_x * blocks_x));

    switch (threads_x) {
        case 32:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 32>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 64:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 64>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 128:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 128>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 256:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 256>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
    }
    POST_LAUNCH_CHECK();
}
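// The scan launchers above switch over a runtime block size because the
// kernels take that size as a compile-time template parameter, letting the
// compiler size shared memory statically and unroll the scan loops. A
// stripped-down sketch of the dispatch pattern in isolation (kernel_sketch
// and dispatch_sketch are illustrative names, not the actual kernels):
template<int DIMX>
__global__ void kernel_sketch(float *out) {
    __shared__ float smem[DIMX];          // sized at compile time
    smem[threadIdx.x] = (float)threadIdx.x;
    __syncthreads();
    if (blockIdx.x == 0) out[threadIdx.x] = smem[DIMX - 1 - threadIdx.x];
}

static void dispatch_sketch(dim3 blocks, dim3 threads, float *out,
                            unsigned threads_x) {
    switch (threads_x) {
        case  32: kernel_sketch< 32><<<blocks, threads>>>(out); break;
        case  64: kernel_sketch< 64><<<blocks, threads>>>(out); break;
        case 128: kernel_sketch<128><<<blocks, threads>>>(out); break;
        case 256: kernel_sketch<256><<<blocks, threads>>>(out); break;
    }
}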
void shift(Param<T> out, CParam<T> in, const int *sdims) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    int sdims_[4];
    // Need to do this because we are mapping output to input in the kernel.
    for (int i = 0; i < 4; i++) {
        // sdims_[i] always lands in [0, out.dims[i]]; negative shifts are
        // converted to a positive offset by going the other way around.
        sdims_[i] = -(sdims[i] % (int)out.dims[i]) + out.dims[i] * (sdims[i] > 0);
        assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
    }

    CUDA_LAUNCH((shift_kernel<T>), blocks, threads, out, in,
                sdims_[0], sdims_[1], sdims_[2], sdims_[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
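// Worked example of the shift normalization above, for out.dims[i] == 10:
//   sdims[i] =  3  ->  -( 3 % 10) + 10 * 1 = 7   (read input 7 ahead, mod 10)
//   sdims[i] = -3  ->  -(-3 % 10) + 10 * 0 = 3
//   sdims[i] = 13  ->  -(13 % 10) + 10 * 1 = 7   (shifts wrap modulo the dim)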
void coo2dense(Param<T> output, CParam<T> values, CParam<int> rowIdx, CParam<int> colIdx) {
    dim3 threads(256, 1, 1);
    dim3 blocks(divup(output.dims[0], threads.x * reps), 1, 1);

    CUDA_LAUNCH((coo2dense_kernel<T>), blocks, threads, output, values, rowIdx, colIdx);
    POST_LAUNCH_CHECK();
}
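// One plausible simplified form of the scatter coo2dense_kernel performs,
// assuming a dense column-major output with leading dimension ld. The actual
// kernel's work distribution follows the reps-per-thread tiling above; this
// per-nonzero version is only a sketch of the data movement:
template<typename T>
__global__ void coo2dense_sketch(T *output, const T *values, const int *rowIdx,
                                 const int *colIdx, int nnz, int ld) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nnz)
        output[colIdx[i] * ld + rowIdx[i]] = values[i];  // scatter each nonzero
}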
static void identity(Param<T> out) {
    dim3 threads(32, 8);

    int blocks_x = divup(out.dims[0], threads.x);
    int blocks_y = divup(out.dims[1], threads.y);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
    POST_LAUNCH_CHECK();
}
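// A minimal single-matrix sketch of an identity kernel for numeric T, assuming
// dense column-major storage (the real kernel also decodes the batch indices
// packed into the grid by the launcher above):
template<typename T>
__global__ void identity_sketch(T *out, int d0, int d1) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;  // row
    const int y = blockIdx.y * blockDim.y + threadIdx.y;  // column
    if (x >= d0 || y >= d1) return;
    out[x + y * d0] = (x == y) ? T(1) : T(0);  // ones on the diagonal
}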
void transpose(Param<T> out, CParam<T> in, const int ndims) {
    // Dimensions passed to this function should be input dimensions; any
    // necessary transformations and dimension-related calculations are
    // carried out here and inside the kernel.
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);

    // Launch batch * blk_x blocks along the x dimension.
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0)
        CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
    else
        CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
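// A minimal sketch of the tiled shared-memory transpose this launcher drives,
// assuming column-major storage and square TILE x TILE thread blocks (the
// real kernel additionally handles conjugation, batching, strides, and uses
// fewer thread rows than TILE_DIM with an internal loop):
template<typename T, int TILE>
__global__ void transpose_sketch(T *out, const T *in, int d0, int d1) {
    __shared__ T tile[TILE][TILE + 1];        // +1 column pads away bank conflicts
    int x = blockIdx.x * TILE + threadIdx.x;  // row in the input
    int y = blockIdx.y * TILE + threadIdx.y;  // column in the input
    if (x < d0 && y < d1) tile[threadIdx.y][threadIdx.x] = in[x + y * d0];
    __syncthreads();
    x = blockIdx.y * TILE + threadIdx.x;      // row in the output
    y = blockIdx.x * TILE + threadIdx.y;      // column in the output
    if (x < d1 && y < d0) out[x + y * d1] = tile[threadIdx.x][threadIdx.y];
}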
void range(Param<T> out, const int dim) {
    dim3 threads(RANGE_TX, RANGE_TY, 1);

    int blocksPerMatX = divup(out.dims[0], RANGE_TILEX);
    int blocksPerMatY = divup(out.dims[1], RANGE_TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    CUDA_LAUNCH((range_kernel<T>), blocks, threads, out, dim,
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void assign(Param<T> out, CParam<T> in, const AssignKernelParam_t& p) {
    const dim3 threads(THREADS_X, THREADS_Y);

    int blks_x = divup(in.dims[0], threads.x);
    int blks_y = divup(in.dims[1], threads.y);

    dim3 blocks(blks_x * in.dims[2], blks_y * in.dims[3]);

    CUDA_LAUNCH((AssignKernel<T>), blocks, threads, out, in, p, blks_x, blks_y);
    POST_LAUNCH_CHECK();
}
void randu(T *out, size_t elements) {
    int device  = getActiveDeviceId();
    int threads = THREADS;
    int blocks  = divup(elements, THREADS);
    if (blocks > BLOCKS) blocks = BLOCKS;

    curandState_t *state = getcurandState();

    CUDA_LAUNCH(uniform_kernel, blocks, threads, out, state, elements);
    POST_LAUNCH_CHECK();
}
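// A plausible sketch of uniform_kernel, assuming one persistent curandState_t
// per launched thread and a grid-stride loop; shown for float only (the real
// kernel maps curand output to every supported T):
#include <curand_kernel.h>

__global__ void uniform_sketch(float *out, curandState_t *state, size_t elements) {
    const size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local = state[tid];     // work on a register copy of the state
    for (size_t i = tid; i < elements; i += (size_t)blockDim.x * gridDim.x)
        out[i] = curand_uniform(&local);  // uniform sample in (0, 1]
    state[tid] = local;                   // write the advanced state back
}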
void reduce_first_launcher(Param<To> out, CParam<Ti> in,
                           const uint blocks_x, const uint blocks_y,
                           const uint threads_x, bool change_nan, double nanval) {
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * in.dims[2], blocks_y * in.dims[3]);

    uint repeat = divup(in.dims[0], (blocks_x * threads_x));

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    switch (threads_x) {
        case 32:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 32>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat,
                        change_nan, scalar<To>(nanval));
            break;
        case 64:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 64>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat,
                        change_nan, scalar<To>(nanval));
            break;
        case 128:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 128>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat,
                        change_nan, scalar<To>(nanval));
            break;
        case 256:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 256>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat,
                        change_nan, scalar<To>(nanval));
            break;
    }
    POST_LAUNCH_CHECK();
}
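// Several launchers in this file fold an oversized y-grid into z with the two
// divup lines above. Worked example, assuming maxGridSize[1] == 65535:
//   blocks.y = 100000  ->  blocks.z = divup(100000, 65535) = 2
//                          blocks.y = divup(100000, 2)     = 50000
// The launch then fits the y-limit, and the kernel is assumed to reconstruct
// the logical y-block as blockIdx.y + blockIdx.z * gridDim.y.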
void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx) {
    dim3 threads(THREADS_X, THREADS_Y);
    dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.y);

    CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads, out, in,
                wx, wy, sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
void reduce_dim_launcher(Param<To> out, CParam<Ti> in, const uint threads_y,
                         const dim_t blocks_dim[4], bool change_nan, double nanval) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_dim[0] * blocks_dim[2], blocks_dim[1] * blocks_dim[3]);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 8>), blocks, threads,
                        out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
                        change_nan, scalar<To>(nanval));
            break;
        case 4:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 4>), blocks, threads,
                        out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
                        change_nan, scalar<To>(nanval));
            break;
        case 2:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 2>), blocks, threads,
                        out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
                        change_nan, scalar<To>(nanval));
            break;
        case 1:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 1>), blocks, threads,
                        out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
                        change_nan, scalar<To>(nanval));
            break;
    }
    POST_LAUNCH_CHECK();
}
void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    CUDA_LAUNCH((iota_kernel<T>), blocks, threads, out,
                sdims[0], sdims[1], sdims[2], sdims[3],
                tdims[0], tdims[1], tdims[2], tdims[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void wrap(Param<T> out, CParam<T> in, const int wx, const int wy,
          const int sx, const int sy, const int px, const int py,
          const bool is_column) {
    int nx = (out.dims[0] + 2 * px - wx) / sx + 1;
    int ny = (out.dims[1] + 2 * py - wy) / sy + 1;

    dim3 threads(THREADS_X, THREADS_Y);

    int blocks_x = divup(out.dims[0], threads.x);
    int blocks_y = divup(out.dims[1], threads.y);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    if (is_column) {
        CUDA_LAUNCH((wrap_kernel<T, true>), blocks, threads, out, in,
                    wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
    } else {
        CUDA_LAUNCH((wrap_kernel<T, false>), blocks, threads, out, in,
                    wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
    }
    POST_LAUNCH_CHECK();
}
void transpose(Param<T> out, CParam<T> in, const int ndims) {
    // Dimensions passed to this function should be input dimensions; any
    // necessary transformations and dimension-related calculations are
    // carried out here and inside the kernel.
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);

    // Launch batch * blk_x blocks along the x dimension.
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0) {
        CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
    } else {
        CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);
    }
    POST_LAUNCH_CHECK();
}
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads, grad0, grad1, in,
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
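// gradient_kernel is assumed to compute central differences along dim0 and
// dim1 in the interior, e.g. grad0(i, j) = 0.5 * (in(i+1, j) - in(i-1, j)),
// falling back to one-sided differences at the boundaries.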
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size) {
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // TODO: add more cases when 5x5 and 7x7 kernels are done
    switch (ker_size) {
        case 3:
            CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x, blk_y);
            break;
    }
    POST_LAUNCH_CHECK();
}
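// For reference, the 3x3 case corresponds to the standard Sobel operators
// (sign conventions inside the actual kernel may differ):
//        [-1  0 +1]         [-1 -2 -1]
//   Gx = [-2  0 +2]    Gy = [ 0  0  0]
//        [-1  0 +1]         [+1 +2 +1]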
void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx) {
    dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

    dim3 threads(TX, THREADS_PER_BLOCK / TX);
    dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.x);  // is > 1 only when TX == 256 && wx * wy > 256

    CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads, out, in,
                wx, wy, sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
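// nextpow2 above is assumed to round up to the next power of two, so TX stays
// a power of two capped at THREADS_PER_BLOCK; a minimal sketch:
static inline dim_t nextpow2_sketch(dim_t n) {
    dim_t p = 1;
    while (p < n) p <<= 1;
    return p;
}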
void rotate(Param<T> out, CParam<T> in, const float theta) {
    const float c = cos(-theta), s = sin(-theta);

    float tx, ty;
    {
        const float nx = 0.5 * (in.dims[0]  - 1);
        const float ny = 0.5 * (in.dims[1]  - 1);
        const float mx = 0.5 * (out.dims[0] - 1);
        const float my = 0.5 * (out.dims[1] - 1);
        const float sx = (mx * c + my * -s);
        const float sy = (mx * s + my *  c);
        tx = -(sx - nx);
        ty = -(sy - ny);
    }

    // Rounding error: anything beyond three decimal places won't make a difference.
    tmat_t t;
    t.tmat[0] = round( c * 1000) / 1000.0f;
    t.tmat[1] = round(-s * 1000) / 1000.0f;
    t.tmat[2] = round(tx * 1000) / 1000.0f;
    t.tmat[3] = round( s * 1000) / 1000.0f;
    t.tmat[4] = round( c * 1000) / 1000.0f;
    t.tmat[5] = round(ty * 1000) / 1000.0f;

    int nimages  = in.dims[2];
    int nbatches = in.dims[3];

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    const int blocksXPerImage = blocks.x;
    const int blocksYPerImage = blocks.y;

    if (nimages > TI) {
        int tile_images = divup(nimages, TI);
        nimages = TI;
        blocks.x = blocks.x * tile_images;
    }

    blocks.y = blocks.y * nbatches;

    CUDA_LAUNCH((rotate_kernel<T, method>), blocks, threads, out, in, t,
                nimages, nbatches, blocksXPerImage, blocksYPerImage);
    POST_LAUNCH_CHECK();
}
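// The 2x3 matrix built above is the inverse (output-to-input) rotation about
// the image centers; for each output pixel the kernel is assumed to sample
//   [x_in]   [ cos(-theta)  -sin(-theta)  tx ] [x_out]
//   [y_in] = [ sin(-theta)   cos(-theta)  ty ] [y_out]
//                                              [  1   ]
// with (tx, ty) chosen so that the output center lands on the input center.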
static void bcast_first_launcher(Param<To> out, CParam<To> tmp,
                                 const uint blocks_x, const uint blocks_y,
                                 const uint threads_x) {
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    uint lim = divup(out.dims[0], (threads_x * blocks_x));

    CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads, out, tmp,
                blocks_x, blocks_y, lim);
    POST_LAUNCH_CHECK();
}
static void bcast_dim_launcher(Param<To> out, CParam<To> tmp,
                               const uint threads_y, const uint blocks_all[4]) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2], blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads, out, tmp,
                blocks_all[0], blocks_all[1], blocks_all[dim], lim);
    POST_LAUNCH_CHECK();
}
void nonMaximal(float* x_out, float* y_out, float* resp_out, unsigned* count,
                const unsigned idim0, const unsigned idim1, const T* resp_in,
                const unsigned edge, const unsigned max_corners) {
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X), divup(idim1 - edge * 2, BLOCK_Y));

    unsigned* d_corners_found = memAlloc<unsigned>(1);
    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
                               cuda::getStream(cuda::getActiveDeviceId())));

    CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads, x_out, y_out, resp_out,
                d_corners_found, idim0, idim1, resp_in, edge, max_corners);
    POST_LAUNCH_CHECK();

    CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned),
                          cudaMemcpyDeviceToHost));
    memFree(d_corners_found);
}
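// nonMaxKernel is assumed to claim output slots through the zero-initialized
// counter allocated above; a minimal sketch of that pattern from inside a
// kernel, where is_local_max, x, y, and resp stand in for per-pixel values:
__device__ void claim_slot_sketch(float *x_out, float *y_out, float *resp_out,
                                  unsigned *count, unsigned max_corners,
                                  bool is_local_max, float x, float y, float resp) {
    if (is_local_max) {
        const unsigned idx = atomicAdd(count, 1u);  // unique slot per corner
        if (idx < max_corners) {
            x_out[idx]    = x;
            y_out[idx]    = y;
            resp_out[idx] = resp;
        }
    }
}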
void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a, const double b,
                   int ndims) {
    dim3 threads(DIMX, DIMY);
    if (ndims == 1) {
        threads.x *= threads.y;
        threads.y = 1;
    }

    int blk_x = divup(out.dims[0], threads.x);
    int blk_y = divup(out.dims[1], threads.y);

    dim3 blocks(blk_x * out.dims[2], blk_y * out.dims[3]);

    CUDA_LAUNCH((select_scalar_kernel<T, flip>), blocks, threads,
                out, cond, a, scalar<T>(b), blk_x, blk_y);
    POST_LAUNCH_CHECK();
}
void iota(Param<T> out, const af::dim4 &sdims, const af::dim4 &tdims) {
    dim3 threads(IOTA_TX, IOTA_TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((iota_kernel<T>), blocks, threads, out,
                sdims[0], sdims[1], sdims[2], sdims[3],
                tdims[0], tdims[1], tdims[2], tdims[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void nearest_neighbour(Param<uint> idx, Param<To> dist, CParam<T> query,
                       CParam<T> train, const dim_t dist_dim, const unsigned n_dist) {
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist = maxval<To>();

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned nquery = query.dims[sample_dim];
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine the maximum feat_len capable of using shared memory (faster).
    int device = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem  = prop.sharedMemPerBlock;
    size_t smem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz   = threads.x * feat_len * sizeof(T);
    bool use_shmem     = (avail_smem >= smem_predef + strain_sz);
    unsigned smem_sz   = use_shmem ? (smem_predef + strain_sz) : smem_predef;

    unsigned nblk = blocks.x;

    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

    // For each query vector, find the training vector with the smallest
    // distance per CUDA block.
    if (use_shmem) {
        switch (feat_len) {
            // Optimized lengths (faster due to loop unrolling)
            case 1:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 2:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 4:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 8:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 16:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 32:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 64:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            default:
                CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train,
                                 max_dist, feat_len);
        }
    } else {
        switch (feat_len) {
            // Optimized lengths (faster due to loop unrolling)
            case 1:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 2:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 4:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 8:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 16:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 32:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            case 64:
                CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
                break;
            default:
                CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>),
                                 blocks, threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train,
                                 max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks  = dim3(nquery, 1);

    // Reduce the per-block minima from each block and store the final best match.
    CUDA_LAUNCH(select_matches, blocks, threads, idx, dist,
                d_blk_idx.get(), d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();
}
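// Worked example of the shared-memory budget above, assuming THREADS == 256,
// sizeof(T) == 4, feat_len == 64, and 48 KiB of shared memory per block:
//   smem_predef = 2 * 256 * 4 + 64 * 4 =  2304 B  (per-block reduction + query vector)
//   strain_sz   = 256 * 64 * 4         = 65536 B  (tile of training vectors)
//   2304 + 65536 > 49152, so use_shmem == false for this configuration and
//   only the predefined portion is requested at launch.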
int computeH(Param<T> bestH, Param<T> H, Param<float> err,
             CParam<float> x_src, CParam<float> y_src,
             CParam<float> x_dst, CParam<float> y_dst,
             CParam<float> rnd, const unsigned iterations,
             const unsigned nsamples, const float inlier_thr,
             const af_homography_type htype) {
    dim3 threads(16, 16);
    dim3 blocks(1, divup(iterations, threads.y));

    // Build linear system and solve SVD
    size_t ls_shared_sz = threads.x * 81 * 2 * sizeof(T);
    CUDA_LAUNCH_SMEM((buildLinearSystem<T>), blocks, threads, ls_shared_sz,
                     H, x_src, y_src, x_dst, y_dst, rnd, iterations);
    POST_LAUNCH_CHECK();

    threads = dim3(256);
    blocks  = dim3(divup(iterations, threads.x));

    // Allocate some temporary buffers
    Param<unsigned> idx, inliers;
    Param<float> median;
    inliers.dims[0] = (htype == AF_HOMOGRAPHY_RANSAC) ? blocks.x
                                                      : divup(nsamples, threads.x);
    inliers.strides[0] = 1;
    idx.dims[0] = median.dims[0] = blocks.x;
    idx.strides[0] = median.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        inliers.dims[k]    = 1;
        inliers.strides[k] = inliers.dims[k-1] * inliers.strides[k-1];
        idx.dims[k]        = median.dims[k] = 1;
        idx.strides[k]     = median.strides[k] = idx.dims[k-1] * idx.strides[k-1];
    }
    idx.ptr     = memAlloc<unsigned>(idx.dims[3] * idx.strides[3]);
    inliers.ptr = memAlloc<unsigned>(inliers.dims[3] * inliers.strides[3]);
    if (htype == AF_HOMOGRAPHY_LMEDS)
        median.ptr = memAlloc<float>(median.dims[3] * median.strides[3]);

    // Compute (and for RANSAC, evaluate) homographies
    CUDA_LAUNCH((computeEvalHomography<T>), blocks, threads,
                inliers, idx, H, err, x_src, y_src, x_dst, y_dst,
                rnd, iterations, nsamples, inlier_thr, htype);
    POST_LAUNCH_CHECK();

    unsigned inliersH, idxH;
    if (htype == AF_HOMOGRAPHY_LMEDS) {
        // TODO: Improve this sorting, if the number of iterations is
        // sufficiently large, this can be *very* slow
        kernel::sort0<float, true>(err);

        unsigned minIdx;
        float minMedian;

        // Compute median of every iteration
        CUDA_LAUNCH((computeMedian), blocks, threads, median, idx, err, iterations);
        POST_LAUNCH_CHECK();

        // Reduce medians, only in case iterations > 256
        if (blocks.x > 1) {
            blocks = dim3(1);

            float* finalMedian = memAlloc<float>(1);
            unsigned* finalIdx = memAlloc<unsigned>(1);

            CUDA_LAUNCH((findMinMedian), blocks, threads,
                        finalMedian, finalIdx, median, idx);
            POST_LAUNCH_CHECK();

            CUDA_CHECK(cudaMemcpyAsync(&minMedian, finalMedian, sizeof(float),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, finalIdx, sizeof(unsigned),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));

            memFree(finalMedian);
            memFree(finalIdx);
        } else {
            CUDA_CHECK(cudaMemcpyAsync(&minMedian, median.ptr, sizeof(float),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, idx.ptr, sizeof(unsigned),
                                       cudaMemcpyDeviceToHost,
                                       cuda::getStream(cuda::getActiveDeviceId())));
        }

        // Copy best homography to output
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + minIdx * 9, 9 * sizeof(T),
                                   cudaMemcpyDeviceToDevice,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        blocks = dim3(divup(nsamples, threads.x));

        // Sync the stream so the device-to-host copies are visible to the
        // subsequent kernel launch.
        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

        CUDA_LAUNCH((computeLMedSInliers<T>), blocks, threads,
                    inliers, bestH, x_src, y_src, x_dst, y_dst,
                    minMedian, nsamples);
        POST_LAUNCH_CHECK();

        // Add up the total number of inliers
        Param<unsigned> totalInliers;
        for (int k = 0; k < 4; k++)
            totalInliers.dims[k] = totalInliers.strides[k] = 1;
        totalInliers.ptr = memAlloc<unsigned>(1);

        kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);

        CUDA_CHECK(cudaMemcpyAsync(&inliersH, totalInliers.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        memFree(totalInliers.ptr);
        memFree(median.ptr);
    } else if (htype == AF_HOMOGRAPHY_RANSAC) {
        Param<unsigned> bestInliers, bestIdx;
        for (int k = 0; k < 4; k++) {
            bestInliers.dims[k]    = bestIdx.dims[k]    = 1;
            bestInliers.strides[k] = bestIdx.strides[k] = 1;
        }
        bestInliers.ptr = memAlloc<unsigned>(1);
        bestIdx.ptr     = memAlloc<unsigned>(1);

        kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.ptr, inliers, 0);

        unsigned blockIdx;
        CUDA_CHECK(cudaMemcpyAsync(&blockIdx, bestIdx.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        // Copy back the index and number of inliers of the best homography estimate
        CUDA_CHECK(cudaMemcpyAsync(&idxH, idx.ptr + blockIdx, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(&inliersH, bestInliers.ptr, sizeof(unsigned),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + idxH * 9, 9 * sizeof(T),
                                   cudaMemcpyDeviceToDevice,
                                   cuda::getStream(cuda::getActiveDeviceId())));

        memFree(bestInliers.ptr);
        memFree(bestIdx.ptr);
    }

    memFree(inliers.ptr);
    memFree(idx.ptr);

    // Sync the stream so the device-to-host copies are visible on the host.
    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

    return (int)inliersH;
}