void transpose(Param<T> out, CParam<T> in, const bool conjugate,
               const bool is32multiple) {
    static const std::string source(transpose_cuh, transpose_cuh_len);

    // clang-format off
    auto transpose = getKernel("cuda::transpose", source,
            {
              TemplateTypename<T>(),
              TemplateArg(conjugate),
              TemplateArg(is32multiple)
            },
            {
              DefineValue(TILE_DIM),
              DefineValue(THREADS_Y)
            });
    // clang-format on

    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    const int maxBlocksY =
        cuda::getDeviceProp(getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    EnqueueArgs qArgs(blocks, threads, getActiveStream());

    transpose(qArgs, out, in, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
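// Illustrative sketch (not part of the backend): the launchers in this file
// repeatedly use divup (assumed to be ceiling division) and, where the logical
// Y block count can exceed the device limit, fold the excess into the Z grid
// dimension as done in transpose above. The numbers below are assumptions
// chosen only for the example.
#include <cassert>

static unsigned divup_sketch(unsigned a, unsigned b) { return (a + b - 1) / b; }

static void grid_split_example() {
    unsigned logicalY = 100000;  // hypothetical blk_y * dims[3]
    unsigned maxGridY = 65535;   // typical maxGridSize[1]
    unsigned z = divup_sketch(logicalY, maxGridY);  // 2
    unsigned y = divup_sketch(logicalY, z);         // 50000
    // every logical Y block is still covered and the device limit is respected
    assert(y <= maxGridY && y * z >= logicalY);
}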
static void scan_first_launcher(Param<To> out, Param<To> tmp, CParam<Ti> in,
                                const uint blocks_x, const uint blocks_y,
                                const uint threads_x) {
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    uint lim = divup(out.dims[0], (threads_x * blocks_x));

    switch (threads_x) {
        case 32:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 32>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 64:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 64>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 128:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 128>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
        case 256:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 256>),
                        blocks, threads, out, tmp, in, blocks_x, blocks_y, lim);
            break;
    }

    POST_LAUNCH_CHECK();
}
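// Illustrative sketch of the dispatch pattern used by the scan launchers in
// this file: a runtime block width is mapped onto a compile-time template
// parameter so the kernel can size its shared-memory buffers statically.
// scan_pass_sketch is a hypothetical stand-in, not the real kernel.
template<unsigned THREADS>
static void scan_pass_sketch() { /* stand-in for scan_first_kernel<..., THREADS> */ }

static void dispatch_example(unsigned threads_x) {
    switch (threads_x) {
        case 32:  scan_pass_sketch<32>();  break;
        case 64:  scan_pass_sketch<64>();  break;
        case 128: scan_pass_sketch<128>(); break;
        case 256: scan_pass_sketch<256>(); break;
    }
}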
void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma,
               uint iter) {
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    const int bCount   = (is_color ? 1 : in.dims[2]);
    const int channels = (is_color ? in.dims[2] : 1);  // this has to be 3 for color images

    dim3 blocks(blk_x * bCount, blk_y * in.dims[3]);

    // clamp spatial and chromatic sigmas
    float space_ = std::min(11.5f, s_sigma);
    int radius   = std::max((int)(space_ * 1.5f), 1);
    int padding  = 2 * radius + 1;

    const float cvar = c_sigma * c_sigma;

    size_t shrd_size =
        channels * (threads.x + padding) * (threads.y + padding) * sizeof(T);

    if (is_color)
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);
    else
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
void transform(Param<T> out, CParam<T> in, CParam<float> tf,
               const bool inverse) {
    int nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const int ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
        cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    const int blocksXPerImage = blocks.x;
    if (nimages > TI) {
        int tile_images = divup(nimages, TI);
        nimages         = TI;
        blocks.x        = blocks.x * tile_images;
    }

    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if (inverse) {
        CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads, out,
                    in, nimages, ntransforms, blocksXPerImage);
    } else {
        CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads, out,
                    in, nimages, ntransforms, blocksXPerImage);
    }

    POST_LAUNCH_CHECK();
}
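// Illustrative sketch (an assumption, not taken from the kernel source): the
// ntransforms * 6 * sizeof(float) bytes copied into constant memory above are
// consistent with one 2x3 affine matrix per transform, stored as 6 consecutive
// floats.
struct AffineTransformSketch {
    // [ m00 m01 m02 ]   x' = m00 * x + m01 * y + m02
    // [ m10 m11 m12 ]   y' = m10 * x + m11 * y + m12
    float m[6];
};
static_assert(sizeof(AffineTransformSketch) == 6 * sizeof(float),
              "matches the per-transform copy size used above");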
static void scan_dim_nonfinal_launcher(Param<To> out, Param<To> tmp,
                                       Param<char> tflg, Param<int> tlid,
                                       CParam<Ti> in, CParam<Tk> key,
                                       const int dim, const uint threads_y,
                                       const uint blocks_all[4],
                                       bool inclusive_scan) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2], blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 8>), blocks,
                        threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 4:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 4>), blocks,
                        threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 2:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 2>), blocks,
                        threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
        case 1:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 1>), blocks,
                        threads, out, tmp, tflg, tlid, in, key, dim,
                        blocks_all[0], blocks_all[1], lim, inclusive_scan);
            break;
    }

    POST_LAUNCH_CHECK();
}
void shift(Param<T> out, CParam<T> in, const int *sdims) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    int sdims_[4];
    // Need to do this because we are mapping output to input in the kernel
    for (int i = 0; i < 4; i++) {
        // sdims_[i] will always be positive and always in [0, oDims[i]].
        // Negative shifts are converted to positive offsets by going the other way round.
        sdims_[i] = -(sdims[i] % (int)out.dims[i]) + out.dims[i] * (sdims[i] > 0);
        assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
    }

    CUDA_LAUNCH((shift_kernel<T>), blocks, threads, out, in, sdims_[0],
                sdims_[1], sdims_[2], sdims_[3], blocksPerMatX, blocksPerMatY);

    POST_LAUNCH_CHECK();
}
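// Illustrative sketch of the sdims_ normalization above: a shift s along a
// dimension of extent d is mapped to an equivalent offset in [0, d], so the
// kernel only ever indexes forward. The example values are assumptions.
#include <cassert>

static int normalize_shift_sketch(int s, int d) {
    return -(s % d) + d * (s > 0);
}

static void shift_example() {
    assert(normalize_shift_sketch( 3, 10) == 7);  // +3 becomes offset 7
    assert(normalize_shift_sketch(-3, 10) == 3);  // -3 becomes offset 3
    assert(normalize_shift_sketch( 0, 10) == 0);  // no shift stays 0
}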
void sort0_by_key(Param<Tk> okey, Param<Tv> oval) {
    thrust::device_ptr<Tk> okey_ptr = thrust::device_pointer_cast(okey.ptr);
    thrust::device_ptr<Tv> oval_ptr = thrust::device_pointer_cast(oval.ptr);

    for (int w = 0; w < okey.dims[3]; w++) {
        int okeyW = w * okey.strides[3];
        int ovalW = w * oval.strides[3];
        for (int z = 0; z < okey.dims[2]; z++) {
            int okeyWZ = okeyW + z * okey.strides[2];
            int ovalWZ = ovalW + z * oval.strides[2];
            for (int y = 0; y < okey.dims[1]; y++) {
                int okeyOffset = okeyWZ + y * okey.strides[1];
                int ovalOffset = ovalWZ + y * oval.strides[1];

                if (isAscending) {
                    thrust::sort_by_key(okey_ptr + okeyOffset,
                                        okey_ptr + okeyOffset + okey.dims[0],
                                        oval_ptr + ovalOffset);
                } else {
                    thrust::sort_by_key(okey_ptr + okeyOffset,
                                        okey_ptr + okeyOffset + okey.dims[0],
                                        oval_ptr + ovalOffset,
                                        thrust::greater<Tk>());
                }
            }
        }
    }

    POST_LAUNCH_CHECK();
}
static void scan_dim_launcher(Param<To> out, Param<To> tmp, CParam<Ti> in,
                              const uint threads_y, const uint blocks_all[4]) {
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2], blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>),
                        blocks, threads, out, tmp, in, blocks_all[0],
                        blocks_all[1], blocks_all[dim], lim);
            break;
        case 4:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>),
                        blocks, threads, out, tmp, in, blocks_all[0],
                        blocks_all[1], blocks_all[dim], lim);
            break;
        case 2:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>),
                        blocks, threads, out, tmp, in, blocks_all[0],
                        blocks_all[1], blocks_all[dim], lim);
            break;
        case 1:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>),
                        blocks, threads, out, tmp, in, blocks_all[0],
                        blocks_all[1], blocks_all[dim], lim);
            break;
    }

    POST_LAUNCH_CHECK();
}
void morph(Param<T> out, CParam<T> in, int windLen) {
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);
    // launch batch * blk_x blocks along x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int halo     = windLen / 2;
    int padding  = 2 * halo;
    int shrdLen  = kernel::THREADS_X + padding + 1;  // +1 to avoid bank conflicts
    int shrdSize = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);

    switch (windLen) {
        case  3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
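// Illustrative sketch of the shared-memory sizing above (values are assumed,
// not taken from the build): each tile is padded by a halo of windLen / 2 on
// every side, plus one extra column to stagger rows and reduce bank conflicts.
static size_t morph_smem_sketch(int threadsX, int threadsY, int windLen,
                                size_t elemSize) {
    int halo    = windLen / 2;
    int padding = 2 * halo;
    int shrdLen = threadsX + padding + 1;  // +1 padding column
    return static_cast<size_t>(shrdLen) * (threadsY + padding) * elemSize;
}
// e.g. 16x16 threads, windLen 5, float elements: (16 + 4 + 1) * (16 + 4) * 4 = 1680 bytes.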
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices,
            int nDims) {
    if (nDims == 1) {
        const dim3 threads(THREADS, 1);

        /* find which dimension has non-zero # of elements */
        int vDim = 0;
        for (int i = 0; i < 4; i++) {
            if (in.dims[i] == 1)
                vDim++;
            else
                break;
        }

        int blks = divup(out.dims[vDim], THREADS * THRD_LOAD);
        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices,
                    vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        dim3 blocks(blks_x * out.dims[2], blks_y * out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in,
                    indices, blks_x, blks_y);
    }

    POST_LAUNCH_CHECK();
}
void sort0_index(Param<T> val, Param<unsigned> idx) {
    thrust::device_ptr<T> val_ptr        = thrust::device_pointer_cast(val.ptr);
    thrust::device_ptr<unsigned> idx_ptr = thrust::device_pointer_cast(idx.ptr);

    for (int w = 0; w < val.dims[3]; w++) {
        int valW = w * val.strides[3];
        int idxW = w * idx.strides[3];
        for (int z = 0; z < val.dims[2]; z++) {
            int valWZ = valW + z * val.strides[2];
            int idxWZ = idxW + z * idx.strides[2];
            for (int y = 0; y < val.dims[1]; y++) {
                int valOffset = valWZ + y * val.strides[1];
                int idxOffset = idxWZ + y * idx.strides[1];

                THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset,
                              idx_ptr + idxOffset + idx.dims[0]);
                if (isAscending) {
                    THRUST_SELECT(thrust::sort_by_key, val_ptr + valOffset,
                                  val_ptr + valOffset + val.dims[0],
                                  idx_ptr + idxOffset);
                } else {
                    THRUST_SELECT(thrust::sort_by_key, val_ptr + valOffset,
                                  val_ptr + valOffset + val.dims[0],
                                  idx_ptr + idxOffset, thrust::greater<T>());
                }
            }
        }
    }

    POST_LAUNCH_CHECK();
}
static void identity(Param<T> out) {
    dim3 threads(32, 8);
    int blocks_x = divup(out.dims[0], threads.x);
    int blocks_y = divup(out.dims[1], threads.y);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
    POST_LAUNCH_CHECK();
}
void coo2dense(Param<T> output, CParam<T> values, CParam<int> rowIdx,
               CParam<int> colIdx) {
    dim3 threads(256, 1, 1);
    dim3 blocks(divup(output.dims[0], threads.x * reps), 1, 1);

    CUDA_LAUNCH((coo2dense_kernel<T>), blocks, threads, output, values, rowIdx,
                colIdx);
    POST_LAUNCH_CHECK();
}
void approx1(Param<Ty> out, CParam<Ty> in, CParam<Tp> pos,
             const float offGrid) {
    dim3 threads(THREADS, 1, 1);
    int blocksPerMat = divup(out.dims[0], threads.x);
    dim3 blocks(blocksPerMat * out.dims[1], out.dims[2] * out.dims[3]);

    approx1_kernel<Ty, Tp, method>
        <<<blocks, threads>>>(out, in, pos, offGrid, blocksPerMat);
    POST_LAUNCH_CHECK();
}
void approx2(Param<Ty> out, CParam<Ty> in, CParam<Tp> pos, CParam<Tp> qos,
             const float offGrid) {
    dim3 threads(TX, TY, 1);
    int blocksPerMatX = divup(out.dims[0], threads.x);
    int blocksPerMatY = divup(out.dims[1], threads.y);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3]);

    approx2_kernel<Ty, Tp, method><<<blocks, threads>>>(
        out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void matchTemplate(Param<outType> out, CParam<inType> srch,
                   CParam<inType> tmplt) {
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(srch.dims[0], threads.x);
    int blk_y = divup(srch.dims[1], threads.y);

    dim3 blocks(blk_x * srch.dims[2], blk_y * srch.dims[3]);

    matchTemplate<inType, outType, mType, needMean>
        <<<blocks, threads>>>(out, srch, tmplt, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    gradient_kernel<T><<<blocks, threads>>>(grad0, grad1, in, blocksPerMatX,
                                            blocksPerMatY);

    POST_LAUNCH_CHECK();
}
void randu(T *out, size_t elements) {
    int device  = getActiveDeviceId();
    int threads = THREADS;
    int blocks  = divup(elements, THREADS);
    if (blocks > BLOCKS) blocks = BLOCKS;

    curandState_t *state = getcurandState();

    CUDA_LAUNCH(uniform_kernel, blocks, threads, out, state, elements);
    POST_LAUNCH_CHECK();
}
void join(Param<Tx> out, CParam<Tx> X, CParam<Ty> Y) {
    dim3 threads(TX, TY, 1);

    dim_type blocksPerMatX = divup(out.dims[0], TILEX);
    dim_type blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    join_kernel<Tx, Ty, dim>
        <<<blocks, threads>>>(out, X, Y, blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void range(Param<T> out, const int dim) {
    dim3 threads(RANGE_TX, RANGE_TY, 1);

    int blocksPerMatX = divup(out.dims[0], RANGE_TILEX);
    int blocksPerMatY = divup(out.dims[1], RANGE_TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    CUDA_LAUNCH((range_kernel<T>), blocks, threads, out, dim, blocksPerMatX,
                blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void iota(Param<T> out) {
    dim3 threads(TX, TY, 1);

    dim_type blocksPerMatX = divup(out.dims[0], TILEX);
    dim_type blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    iota_kernel<T, rep><<<blocks, threads>>>(out, blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void assign(Param<T> out, CParam<T> in, const AssignKernelParam_t& p) {
    const dim3 threads(THREADS_X, THREADS_Y);

    int blks_x = divup(in.dims[0], threads.x);
    int blks_y = divup(in.dims[1], threads.y);

    dim3 blocks(blks_x * in.dims[2], blks_y * in.dims[3]);

    AssignKernel<T><<<blocks, threads>>>(out, in, p, blks_x, blks_y);

    POST_LAUNCH_CHECK();
}
void susan_responses(T* out, const T* in, const unsigned idim0,
                     const unsigned idim1, const int radius, const float t,
                     const float g, const unsigned edge) {
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X),
                divup(idim1 - edge * 2, BLOCK_Y));
    const size_t SMEM_SIZE =
        (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE, out, in,
                     idim0, idim1, radius, t, g, edge);

    POST_LAUNCH_CHECK();
}
void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx) {
    dim3 threads(THREADS_X, THREADS_Y);
    dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.y);

    CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads, out, in, wx, wy,
                sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
void exampleFunc(Param<T> out, CParam<T> in, const af_someenum_t p) {
    dim3 threads(TX, TY, 1);  // set your cuda launch config for blocks

    dim_type blk_x = divup(out.dims[0], threads.x);
    dim_type blk_y = divup(out.dims[1], threads.y);

    dim3 blocks(blk_x, blk_y);  // set your cuda launch config for grid

    // launch your kernel
    exampleFuncKernel<T><<<blocks, threads>>>(out, in, p);

    POST_LAUNCH_CHECK();  // Macro for post kernel launch checks
                          // these checks are carried out ONLY IN DEBUG mode
}
void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1);

    CUDA_LAUNCH((iota_kernel<T>), blocks, threads, out, sdims[0], sdims[1],
                sdims[2], sdims[3], tdims[0], tdims[1], tdims[2], tdims[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads, grad0, grad1, in,
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in,
           const unsigned &ker_size) {
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // TODO: add more cases when 5x5 and 7x7 kernels are done
    switch (ker_size) {
        case 3:
            CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x,
                        blk_y);
            break;
    }

    POST_LAUNCH_CHECK();
}
void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx) {
    dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

    dim3 threads(TX, THREADS_PER_BLOCK / TX);
    dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.x);  // is > 1 only when TX == 256 && wx * wy > 256

    CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads, out, in, wx, wy,
                sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval,
               float maxval) {
    dim3 threads(kernel::THREADS_X, 1);

    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD * THREADS_X);

    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    // If nbins > MAX_BINS, we are using global memory so smem_size can be 0
    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks,
                     threads, smem_size, out, in, nElems, nbins, minval,
                     maxval, blk_x);

    POST_LAUNCH_CHECK();
}
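// Illustrative sketch of the smem_size decision above: per-block bins fit in
// shared memory only when nbins is small enough; otherwise the kernel is
// assumed to accumulate directly in global memory, so no dynamic shared memory
// is requested. The example numbers are assumptions.
static size_t histogram_smem_sketch(int nbins, int maxBins, size_t binSize) {
    return (nbins <= maxBins) ? static_cast<size_t>(nbins) * binSize : 0;
}
// e.g. nbins = 256, MAX_BINS = 4096, 4-byte bins: 256 * 4 = 1024 bytes.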