// Builds (or fetches from the per-device cache) the scan-along-dimension
// program for the current template configuration (Ti, To, op, inclusive_scan)
// and returns one of its two kernels.
//   kerIdx == 0 -> "scan_dim_kernel" (per-block scan pass)
//   kerIdx == 1 -> "bcast_dim_kernel" (broadcast/final pass)
static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
{
    // Cache key encodes every parameter that changes the generated binary.
    std::string ref_name =
        std::string("scan_") + std::to_string(dim) +
        std::string("_") + std::to_string(isFinalPass) +
        std::string("_") + std::string(dtype_traits<Ti>::getName()) +
        std::string("_") + std::string(dtype_traits<To>::getName()) +
        std::string("_") + std::to_string(op) +
        std::string("_") + std::to_string(threads_y) +
        std::string("_") + std::to_string(int(inclusive_scan));

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        Binary<To, op> scan;
        ToNumStr<To> toNumStr;

        std::ostringstream options;
        options << " -D To=" << dtype_traits<To>::getName()
                << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D T=To"
                << " -D dim=" << dim
                << " -D DIMY=" << threads_y
                << " -D THREADS_X=" << THREADS_X
                << " -D init=" << toNumStr(scan.init())
                << " -D " << binOpName<op>()
                << " -D CPLX=" << af::iscplx<Ti>()
                // FIX: the flag value was replaced by a corrupted literal
                // ("******") which is not valid C++; stream the boolean
                // through as an int so the OpenCL macro gets 0/1.
                << " -D isFinalPass=" << (int)isFinalPass
                << " -D inclusive_scan=" << inclusive_scan;
        if (std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {ops_cl, scan_dim_cl};
        const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
        cl::Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
        entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    return entry.ker[kerIdx];
}
// Launches the CSR sparse-vs-dense arithmetic kernel (the operation `op` and
// element type `T` come from the enclosing template). The sparse operand is
// given by values/rowIdx/colIdx, the dense operand by rhs, result into out.
// `reverse` is forwarded to the kernel — presumably it swaps the operand
// order (dense OP sparse instead of sparse OP dense); confirm against
// sparse_arith_csr_cl.
void sparseArithOpCSR(Param out, const Param values, const Param rowIdx, const Param colIdx, const Param rhs, const bool reverse) {
    // One compiled program per (op, type) pair per device.
    std::string ref_name = std::string("sparseArithOpCSR_") + getOpString<op>() + std::string("_") + std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile and register
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D OP=" << getOpString<op>();

        // Complex element types need dedicated arithmetic in the kernel.
        if((af_dtype) dtype_traits<T>::af_type == c32 ||
           (af_dtype) dtype_traits<T>::af_type == c64) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {sparse_arith_common_cl , sparse_arith_csr_cl};
        const int ker_lens[] = {sparse_arith_common_cl_len, sparse_arith_csr_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "sparse_arith_csr_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    auto sparseArithCSROp = KernelFunctor<Buffer, const KParam,
                                          const Buffer, const Buffer, const Buffer,
                                          const int,
                                          const Buffer, const KParam,
                                          const int>(*entry.ker);

    // TX work-items per row group; rows are tiled by TY along the first dim.
    NDRange local(TX, TY, 1);
    NDRange global(divup(out.info.dims[0], TY) * TX, TY, 1);

    sparseArithCSROp(EnqueueArgs(getQueue(), global, local),
                     *out.data, out.info,
                     *values.data, *rowIdx.data, *colIdx.data,
                     values.info.dims[0], // number of non-zeros
                     *rhs.data, rhs.info, reverse);

    CL_DEBUG_FINISH(getQueue());
}
// 3D morphological operation on `in` with structuring element `mask`
// (a cube of edge SeLength). Whether it dilates or erodes is selected by the
// isDilation template parameter baked into the compiled program.
void morph3d(Param out, const Param in, const Param mask) {
    std::string refName = std::string("morph3d_") + std::string(dtype_traits<T>::getName()) + std::to_string(isDilation) + std::to_string(SeLength);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile and register
        std::string options = generateOptionsString<T, isDilation, SeLength>();

        const char* ker_strs[] = {morph_cl};
        const int ker_lens[] = {morph_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "morph3d");

        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, cl::LocalSpaceArg, int >(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);

    // launch batch * blk_x blocks along x dimension
    // (dims[3] is the batch count; the kernel uses blk_x to recover the
    //  batch index from the group id)
    NDRange global(blk_x * CUBE_X * in.info.dims[3], blk_y * CUBE_Y, blk_z * CUBE_Z);

    // copy mask/filter to constant memory
    // (device-to-device copy into a scratch buffer that lives only for this
    //  launch; freed below after the kernel is enqueued)
    cl_int se_size = sizeof(T)*SeLength*SeLength*SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

    // calculate shared memory size
    // halo = SeLength-1 elements total around the tile; the extra +1 on x is
    // presumably to avoid bank conflicts — confirm against morph_cl.
    const int padding = (SeLength%2==0 ? (SeLength-1) : (2*(SeLength/2)));
    const int locLen = CUBE_X+padding+1;
    const int locArea = locLen *(CUBE_Y+padding);
    const int locSize = locArea*(CUBE_Z+padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize*sizeof(T)), blk_x);

    bufferFree(mBuff);
    CL_DEBUG_FINISH(getQueue());
}
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out, const unsigned idim0, const unsigned idim1, const cl::Buffer* resp_in, const unsigned edge, const unsigned max_corners) { unsigned corners_found = 0; std::string refName = std::string("non_maximal_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX"; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {susan_cl}; const int ker_lens[] = {susan_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "non_maximal"); addKernelToCache(device, refName, entry); } cl::Buffer* d_corners_found = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); auto nonMaximalOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer, unsigned, unsigned, Buffer, unsigned, unsigned>(*entry.ker); NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y); NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0], divup(idim1 - 2 * edge, local[1]) * local[1]); nonMaximalOp(EnqueueArgs(getQueue(), global, local), *x_out, *y_out, *resp_out, *d_corners_found, idim0, idim1, *resp_in, edge, max_corners); getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); bufferFree(d_corners_found); return corners_found; }
// First/second-order finite difference along `dim` (template parameters dim
// and isDiff2 are baked into the compiled kernel).
void diff(Param out, const Param in, const unsigned indims)
{
    std::string refName = std::string("diff_kernel_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(dim) + std::to_string(isDiff2);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName()
                    << " -D DIM=" << dim
                    << " -D isDiff2=" << isDiff2;
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            compileOpts << " -D USE_DOUBLE";
        }

        const char* srcs[] = {diff_cl};
        const int   lens[] = {diff_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "diff_kernel");
        addKernelToCache(device, refName, entry);
    }

    auto diffOp = KernelFunctor<Buffer, const Buffer, const KParam, const KParam,
                                const int, const int, const int>(*entry.ker);

    // A 1D input differentiated along dim 0 gets a single flat row of threads.
    NDRange local = (dim == 0 && indims == 1) ? NDRange(TX * TY, 1, 1)
                                              : NDRange(TX, TY, 1);

    int blocksPerMatX = divup(out.info.dims[0], local[0]);
    int blocksPerMatY = divup(out.info.dims[1], local[1]);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3], 1);

    const int oElem = out.info.dims[0] * out.info.dims[1] *
                      out.info.dims[2] * out.info.dims[3];

    diffOp(EnqueueArgs(getQueue(), global, local),
           *out.data, *in.data, out.info, in.info,
           oElem, blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
// 2D median filter with a (w_len x w_wid) window; pad mode and window size
// are template parameters compiled into the kernel.
void medfilt2(Param out, const Param in)
{
    std::string refName = std::string("medfilt2_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(pad) +
                          std::to_string(w_len) + std::to_string(w_wid);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        // Per-work-item selection buffer length used by the kernel.
        const int ARR_SIZE = w_len * (w_wid - w_wid / 2);

        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName()
                    << " -D pad=" << pad
                    << " -D AF_PAD_ZERO=" << AF_PAD_ZERO
                    << " -D AF_PAD_SYM=" << AF_PAD_SYM
                    << " -D ARR_SIZE=" << ARR_SIZE
                    << " -D w_len=" << w_len
                    << " -D w_wid=" << w_wid;
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            compileOpts << " -D USE_DOUBLE";

        const char* srcs[] = {medfilt2_cl};
        const int   lens[] = {medfilt2_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "medfilt2");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(THREADS_X, THREADS_Y);

    const int blk_x = divup(in.info.dims[0], THREADS_X);
    const int blk_y = divup(in.info.dims[1], THREADS_Y);
    // Batch dims (2, 3) are folded into the x/y launch extents.
    NDRange global(blk_x * in.info.dims[2] * THREADS_X,
                   blk_y * in.info.dims[3] * THREADS_Y);

    auto medfiltOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                   cl::LocalSpaceArg, int, int>(*entry.ker);

    // Local tile: work-group extent plus the window halo on each axis.
    const size_t loc_size =
        (THREADS_X + w_len - 1) * (THREADS_Y + w_wid - 1) * sizeof(T);

    medfiltOp(EnqueueArgs(getQueue(), global, local),
              *out.data, out.info, *in.data, in.info,
              cl::Local(loc_size), blk_x, blk_y);

    CL_DEBUG_FINISH(getQueue());
}
// Copies `in` into `out` at the given 4D offset (one input of a join;
// the join dimension is the `dim` template parameter).
void join(Param out, const Param in, const af::dim4 offset)
{
    std::string refName = std::string("join_kernel_") +
                          std::string(dtype_traits<To>::getName()) +
                          std::string(dtype_traits<Ti>::getName()) +
                          std::to_string(dim);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D dim=" << dim;

        // Double-precision support is needed if either type is (c)double.
        const bool needsDouble =
            std::is_same<To, double>::value || std::is_same<To, cdouble>::value ||
            std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value;
        if (needsDouble) {
            compileOpts << " -D USE_DOUBLE";
        }

        const char* srcs[] = {join_cl};
        const int   lens[] = {join_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "join_kernel");
        addKernelToCache(device, refName, entry);
    }

    auto joinOp = KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int>(*entry.ker);

    NDRange local(TX, TY, 1);

    const int blocksPerMatX = divup(in.info.dims[0], TILEX);
    const int blocksPerMatY = divup(in.info.dims[1], TILEY);
    NDRange global(local[0] * blocksPerMatX * in.info.dims[2],
                   local[1] * blocksPerMatY * in.info.dims[3], 1);

    joinOp(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info, *in.data, in.info,
           offset[0], offset[1], offset[2], offset[3],
           blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues one pass of the N-D convolution kernel. Launch geometry, the
// local-memory size, the staged filter slice (param.impulse) and the
// offset/step vectors all come precomputed in `param`.
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter) {
    std::string ref_name = std::string("convolveND_") + std::string(dtype_traits<T>::getName()) + std::string(dtype_traits<aT>::getName()) + std::to_string(bDim) + std::to_string(expand);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile and register
        std::ostringstream options;
        // T is both the input and element type; aT is the accumulator type.
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D Ti=" << dtype_traits<T>::getName()
                << " -D To=" << dtype_traits<aT>::getName()
                << " -D accType=" << dtype_traits<aT>::getName()
                << " -D BASE_DIM=" << bDim
                << " -D EXPAND=" << expand
                << " -D " << binOpName<af_mul_t>();

        if((af_dtype) dtype_traits<T>::af_type == c32 ||
           (af_dtype) dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
        } else {
            options << " -D CPLX=0";
        }
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char *ker_strs[] = {ops_cl, convolve_cl};
        const int ker_lens[] = {ops_cl_len, convolve_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "convolve");

        addKernelToCache(device, ref_name, entry);
    }

    auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    cl::LocalSpaceArg, Buffer, KParam,
                                    int, int,
                                    int, int, int,
                                    int, int, int
                                   >(*entry.ker);

    // Argument order is fixed by the kernel signature:
    // out, signal, local scratch, impulse (staged filter), filter info,
    // blocks-per-batch counts (nBBS0/nBBS1), offsets o[0..2], steps s[0..2].
    convOp(EnqueueArgs(getQueue(), param.global, param.local),
           *out.data, out.info, *signal.data, signal.info,
           cl::Local(param.loc_size), *param.impulse, filter.info,
           param.nBBS0, param.nBBS1,
           param.o[0], param.o[1], param.o[2],
           param.s[0], param.s[1], param.s[2]);
}
static void get_out_idx(Buffer *out_data, Param &otmp, Param &rtmp, Param &in, uint threads_x, uint groups_x, uint groups_y) { std::string refName = std::string("get_out_idx_kernel_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog==0 && entry.ker==0) { ToNumStr<T> toNumStr; std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D zero=" << toNumStr(scalar<T>(0)) << " -D CPLX=" << af::iscplx<T>(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {where_cl}; const int ker_lens[] = {where_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "get_out_idx_kernel"); addKernelToCache(device, refName, entry); } NDRange local(threads_x, THREADS_PER_GROUP / threads_x); NDRange global(local[0] * groups_x * in.info.dims[2], local[1] * groups_y * in.info.dims[3]); uint lim = divup(otmp.info.dims[0], (threads_x * groups_x)); auto whereOp = KernelFunctor< Buffer, Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); whereOp(EnqueueArgs(getQueue(), global, local), *out_data, *otmp.data, otmp.info, *rtmp.data, rtmp.info, *in.data, in.info, groups_x, groups_y, lim); CL_DEBUG_FINISH(getQueue()); }
// MAGMA-style LASET: sets the off-diagonal entries of the m x n matrix dA
// (leading dimension ldda, starting at dA_offset) to `offdiag` and the
// diagonal entries to `diag`. Which part of the matrix is touched depends on
// the uplo template parameter — the kernel name itself comes from
// laset_name<uplo>().
void laset(int m, int n, T offdiag, T diag, cl_mem dA, size_t dA_offset, magma_int_t ldda) {
    std::string refName = laset_name<uplo>() + std::string("_") + std::string(dtype_traits<T>::getName()) + std::to_string(uplo);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile and register
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D BLK_X=" << BLK_X
                << " -D BLK_Y=" << BLK_Y
                << " -D IS_CPLX=" << af::iscplx<T>();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {laset_cl};
        const int ker_lens[] = {laset_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, laset_name<uplo>());

        addKernelToCache(device, refName, entry);
    }

    // Ceiling division: each BLK_X x BLK_Y tile gets one work-group column.
    int groups_x = (m - 1) / BLK_X + 1;
    int groups_y = (n - 1) / BLK_Y + 1;

    // Work-groups are BLK_X x 1; each work-item covers BLK_Y columns.
    NDRange local(BLK_X, 1);
    NDRange global(groups_x * local[0], groups_y * local[1]);

    // retain the cl_mem object during cl::Buffer creation
    cl::Buffer dAObj(dA, true);

    auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker);

    lasetOp(EnqueueArgs(getQueue(), global, local),
            m, n, offdiag, diag, dAObj, dA_offset, ldda);
}
// Extracts the upper/lower triangle of `in` into `out` (is_upper and
// is_unit_diag are template parameters compiled into the kernel).
void triangle(Param out, const Param in)
{
    std::string refName = std::string("triangle_kernel_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(is_upper) +
                          std::to_string(is_unit_diag);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName()
                    << " -D is_upper=" << is_upper
                    << " -D is_unit_diag=" << is_unit_diag
                    << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")"
                    << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            compileOpts << " -D USE_DOUBLE";

        const char* srcs[] = {triangle_cl};
        const int   lens[] = {triangle_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "triangle_kernel");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(TX, TY);

    const int groups_x = divup(out.info.dims[0], TILEX);
    const int groups_y = divup(out.info.dims[1], TILEY);
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto triangleOp = KernelFunctor<Buffer, KParam, const Buffer, KParam,
                                    const int, const int>(*entry.ker);

    triangleOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *in.data, in.info, groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
// Fills `out` with an iota sequence shaped by sdims and tiled by tdims.
void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims)
{
    std::string refName = std::string("iota_kernel_") +
                          std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            compileOpts << " -D USE_DOUBLE";

        const char* srcs[] = {iota_cl};
        const int   lens[] = {iota_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "iota_kernel");
        addKernelToCache(device, refName, entry);
    }

    auto iotaOp = KernelFunctor<Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int, const int, const int,
                                const int, const int>(*entry.ker);

    NDRange local(IOTA_TX, IOTA_TY, 1);

    const int blocksPerMatX = divup(out.info.dims[0], TILEX);
    const int blocksPerMatY = divup(out.info.dims[1], TILEY);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3], 1);

    iotaOp(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           sdims[0], sdims[1], sdims[2], sdims[3],
           tdims[0], tdims[1], tdims[2], tdims[3],
           blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
// Computes SUSAN corner responses for the interior of the image (a border of
// `edge` pixels is skipped); t and g are the brightness and geometric
// thresholds forwarded to the kernel.
void susan(cl::Buffer* out, const cl::Buffer* in, const unsigned in_off,
           const unsigned idim0, const unsigned idim1,
           const float t, const float g, const unsigned edge)
{
    std::string refName = std::string("susan_responses_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(radius);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        // Local tile plus a halo of `radius` pixels on every side.
        const size_t LOCAL_MEM_SIZE =
            (SUSAN_THREADS_X + 2 * radius) * (SUSAN_THREADS_Y + 2 * radius);

        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName()
                    << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE
                    << " -D BLOCK_X=" << SUSAN_THREADS_X
                    << " -D BLOCK_Y=" << SUSAN_THREADS_Y
                    << " -D RADIUS=" << radius
                    << " -D RESPONSE";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            compileOpts << " -D USE_DOUBLE";

        const char* srcs[] = {susan_cl};
        const int   lens[] = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "susan_responses");
        addKernelToCache(device, refName, entry);
    }

    auto susanOp = KernelFunctor<Buffer, Buffer, unsigned, unsigned, unsigned,
                                 float, float, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    susanOp(EnqueueArgs(getQueue(), global, local),
            *out, *in, in_off, idim0, idim1, t, g, edge);
}
// Converts between HSV and RGB color spaces; direction is selected by the
// isHSV2RGB template parameter.
void hsv2rgb_convert(Param out, const Param in)
{
    std::string refName = std::string("hsvrgb_convert_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(isHSV2RGB);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName();
        if (isHSV2RGB) {
            compileOpts << " -D isHSV2RGB";
        }
        if (std::is_same<T, double>::value) {
            compileOpts << " -D USE_DOUBLE";
        }

        const char* srcs[] = {hsv_rgb_cl};
        const int   lens[] = {hsv_rgb_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "convert");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(THREADS_X, THREADS_Y);

    const int blk_x = divup(in.info.dims[0], THREADS_X);
    const int blk_y = divup(in.info.dims[1], THREADS_Y);

    // all images are three channels, so batch
    // parameter would be along 4th dimension
    NDRange global(blk_x * in.info.dims[3] * THREADS_X, blk_y * THREADS_Y);

    auto hsvrgbOp = KernelFunctor<Buffer, KParam, Buffer, KParam, int>(*entry.ker);

    hsvrgbOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info, *in.data, in.info, blk_x);

    CL_DEBUG_FINISH(getQueue());
}
// Writes an identity matrix (batched over dims 2 and 3) into `out`.
static void identity(Param out)
{
    std::string refName = std::string("identity_kernel") +
                          std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream compileOpts;
        compileOpts << " -D T=" << dtype_traits<T>::getName()
                    << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")"
                    << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            compileOpts << " -D USE_DOUBLE";
        }

        const char* srcs[] = {identity_cl};
        const int   lens[] = {identity_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, compileOpts.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "identity_kernel");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(32, 8);

    const int groups_x = divup(out.info.dims[0], local[0]);
    const int groups_y = divup(out.info.dims[1], local[1]);
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto identityOp = KernelFunctor<Buffer, const KParam, int, int>(*entry.ker);

    identityOp(EnqueueArgs(getQueue(), global, local),
               *(out.data), out.info, groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
// Rearranges `in` into per-window slices (im2col-style — confirm exact layout
// against unwrap_cl) using windows of size (wx, wy), strides (sx, sy) and
// padding (px, py); nx is the number of windows along dim 0. `is_column`
// selects between the column-major and row-major kernel variants and is baked
// into the compiled program.
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
            const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
            const dim_t nx, const bool is_column)
{
    std::string ref_name = std::string("unwrap_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(is_column);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        ToNumStr<T> toNumStr;
        std::ostringstream options;
        options << " -D is_column=" << is_column
                << " -D ZERO=" << toNumStr(scalar<T>(0))
                << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        // FIX(consistency): use the (count, sources, lengths, options)
        // buildProgram form used by every other launcher in this file; the
        // previous call passed the source/length directly.
        const char* ker_strs[] = {unwrap_cl};
        const int ker_lens[] = {unwrap_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "unwrap_kernel");
        addKernelToCache(device, ref_name, entry);
    }

    dim_t TX = 1, TY = 1;
    dim_t BX = 1;
    const dim_t BY = out.info.dims[2] * out.info.dims[3]; // batch groups
    dim_t reps = 1;

    if (is_column) {
        TX = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
        TY = THREADS_PER_GROUP / TX;
        BX = divup(out.info.dims[1], TY);
        reps = divup((wx * wy), TX); // window elements per work-item
    } else {
        TX = THREADS_X;
        TY = THREADS_Y;
        BX = divup(out.info.dims[0], TX);
        reps = divup((wx * wy), TY);
    }

    NDRange local(TX, TY);
    NDRange global(local[0] * BX, local[1] * BY);

    auto unwrapOp = KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                                  const int, const int, const int, const int,
                                  const int, const int, const int, const int>(*entry.ker);

    unwrapOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info, *in.data, in.info,
             wx, wy, sx, sy, px, py, nx, reps);

    CL_DEBUG_FINISH(getQueue());
}
// Sparse (CSR) matrix-vector multiply: out = alpha * A * rhs + beta * out,
// where A is given by values/rowIdx/colIdx.
void csrmv(Param out, const Param &values, const Param &rowIdx, const Param &colIdx, const Param &rhs, const T alpha, const T beta) {
    // Compile out the scale/accumulate work when alpha/beta are trivial.
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    // FIXME: Find a better number based on average non zeros per row
    int threads = 64;

    std::string ref_name = std::string("csrmv_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::to_string(use_alpha) + std::string("_") + std::to_string(use_beta) + std::string("_") + std::to_string(use_greedy) + std::string("_") + std::to_string(threads);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile both variants
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS=" << threads;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmv_cl};
        const int ker_lens[] = {csrmv_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        // Two strategies from one program:
        //   ker[0] "csrmv_thread": one work-item per row
        //   ker[1] "csrmv_block" : one work-group per row
        entry.ker = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
        entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

        addKernelToCache(device, ref_name, entry);
    }

    // Device-side counter, zero-initialized (used for greedy row indexing —
    // still passed even though use_greedy is false).
    int count = 0;
    cl::Buffer *counter = bufferAlloc(sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, sizeof(int), (void *)&count);

    // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
    bool is_csrmv_block = true;
    auto csrmv_kernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];
    auto csrmv_func = KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, Buffer, KParam, T, T, Buffer>(csrmv_kernel);

    NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);

    // CSR rowIdx has M+1 entries for an M-row matrix.
    int M = rowIdx.info.dims[0] - 1;
    int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
    groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);
    NDRange global(local[0] * groups_x, 1);

    csrmv_func(EnqueueArgs(getQueue(), global, local),
               *out.data, *values.data, *rowIdx.data, *colIdx.data,
               M, *rhs.data, rhs.info, alpha, beta, *counter);
    CL_DEBUG_FINISH(getQueue());
    bufferFree(counter);
}
// Sparse (CSR) matrix times transposed dense matrix:
// out = alpha * A * rhs^T + beta * out, with A given by
// values/rowIdx/colIdx.
void csrmm_nt(Param out, const Param &values, const Param &rowIdx, const Param &colIdx, const Param &rhs, const T alpha, const T beta) {
    // Compile out the scale/accumulate work when alpha/beta are trivial.
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string ref_name = std::string("csrmm_nt_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::to_string(use_alpha) + std::string("_") + std::to_string(use_beta) + std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) { // cache miss: compile and register
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[] = {csrmm_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func = KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam, T, T, Buffer>(csrmm_nt_kernel);

    NDRange local(THREADS_PER_GROUP, 1);

    // CSR rowIdx has M+1 entries for an M-row matrix; N is the dense side.
    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // One counter slot per x-group, zero-initialized (greedy row indexing —
    // still passed even though use_greedy is false).
    std::vector<int> count(groups_x);
    cl::Buffer *counter = bufferAlloc(count.size() * sizeof(int));
    getQueue().enqueueWriteBuffer( *counter, CL_TRUE, 0, count.size() * sizeof(int), (void *)count.data());

    csrmm_nt_func(EnqueueArgs(getQueue(), global, local),
                  *out.data, *values.data, *rowIdx.data, *colIdx.data,
                  M, N, *rhs.data, rhs.info, alpha, beta, *counter);
    // NOTE(review): unlike the other launchers here this one has no
    // CL_DEBUG_FINISH before freeing — confirm intentional.
    bufferFree(counter);
}
// First-pass mean reduction along dim 0. Optional weights: a non-empty
// inWeight enables weighted input (INPUT_WEIGHT), a non-empty owt makes the
// kernel also write per-element accumulated weights (OUTPUT_WEIGHT). The
// four weight combinations map to four kernel signatures, dispatched below.
void mean_first_launcher(Param out, Param owt, Param in, Param inWeight, const int threads_x, const uint groups_x, const uint groups_y) {
    // A Param with all-zero dims is treated as "absent".
    bool input_weight = ((inWeight.info.dims[0] * inWeight.info.dims[1] * inWeight.info.dims[2] * inWeight.info.dims[3]) != 0);
    bool output_weight = (( owt.info.dims[0] * owt.info.dims[1] * owt.info.dims[2] * owt.info.dims[3]) != 0);

    std::string ref_name = std::string("mean_0_") + std::string(dtype_traits<Ti>::getName()) + std::string("_") + std::string(dtype_traits<Tw>::getName()) + std::string("_") + std::string(dtype_traits<To>::getName()) + std::string("_") + std::to_string(threads_x) + std::string("_") + std::to_string(input_weight) + std::string("_") + std::to_string(output_weight);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) { // cache miss: compile and register
        Binary<To, af_add_t> mean;
        ToNumStr<To> toNumStr;
        ToNumStr<Tw> twNumStr;
        Transform<uint, Tw, af_add_t> transform_weight;

        std::ostringstream options;
        options << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D Tw=" << dtype_traits<Tw>::getName()
                << " -D To=" << dtype_traits<To>::getName()
                << " -D DIMX=" << threads_x
                << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
                << " -D init_To=" << toNumStr(mean.init())
                << " -D init_Tw=" << twNumStr(transform_weight(0))
                << " -D one_Tw=" << twNumStr(transform_weight(1));
        if (input_weight) { options << " -D INPUT_WEIGHT"; }
        if (output_weight) { options << " -D OUTPUT_WEIGHT"; }
        if (std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value ||
            std::is_same<To, double>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
        const int ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "mean_first_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
                   groups_y * in.info.dims[3] * local[1]);

    // Elements each work-item accumulates along dim 0.
    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

    // The kernel's argument list changes with the weight macros, so each
    // combination needs its own functor signature.
    if (input_weight && output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam, Buffer, KParam,
            Buffer, KParam, Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *owt.data, owt.info,
               *in.data, in.info, *inWeight.data, inWeight.info,
               groups_x, groups_y, repeat);
    } else if (!input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam, Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *in.data, in.info,
               groups_x, groups_y, repeat);
    } else if ( input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam, Buffer, KParam, Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *in.data, in.info,
               *inWeight.data, inWeight.info,
               groups_x, groups_y, repeat);
    } else if (!input_weight && output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam, Buffer, KParam, Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *owt.data, owt.info,
               *in.data, in.info,
               groups_x, groups_y, repeat);
    }

    CL_DEBUG_FINISH(getQueue());
}
// Resizes `in` into `out` (dims taken from out.info) using the interpolation
// selected by the template parameter `method` (declared outside this view).
// Batched over dims 2 and 3. The scale factors passed to the kernel are
// in/out ratios, so values > 1 mean downscaling.
void resize(Param out, const Param in) {
    typedef typename dtype_traits<T>::base_type BT;

    // BUG FIX: the cache key used to be "reorder_kernel_" + type + method —
    // a copy/paste from the reorder kernel. Besides being misleading, a
    // matching key built elsewhere would fetch the wrong cached cl::Kernel.
    // Use a resize-specific key, with a separator before the method so the
    // fields can't run together.
    std::string refName = std::string("resize_kernel_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::string("_") + std::to_string(method);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    // Cache miss: build the program once per device/type/method.
    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D VT=" << dtype_traits<vtype_t<T>>::getName();
        options << " -D WT=" << dtype_traits<wtype_t<BT>>::getName();

        // Select the interpolation flavor at compile time.
        switch (method) {
            case AF_INTERP_NEAREST: options << " -D INTERP=NEAREST"; break;
            case AF_INTERP_BILINEAR: options << " -D INTERP=BILINEAR"; break;
            case AF_INTERP_LOWER: options << " -D INTERP=LOWER"; break;
            default: break;
        }

        // Complex types also need the base (real) type TB inside the kernel.
        if ((af_dtype)dtype_traits<T>::af_type == c32 ||
            (af_dtype)dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
            options << " -D TB=" << dtype_traits<BT>::getName();
        } else {
            options << " -D CPLX=0";
        }
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {resize_cl};
        const int ker_lens[] = {resize_cl_len};
        cl::Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new cl::Program(prog);
        entry.ker = new cl::Kernel(*entry.prog, "resize_kernel");

        addKernelToCache(device, refName, entry);
    }

    auto resizeOp =
        cl::KernelFunctor<cl::Buffer, const KParam, const cl::Buffer, const KParam,
                          const int, const int, const float, const float>(*entry.ker);

    cl::NDRange local(RESIZE_TX, RESIZE_TY, 1);

    // Tile the output plane; dims 2/3 are folded into the grid for batching.
    int blocksPerMatX = divup(out.info.dims[0], local[0]);
    int blocksPerMatY = divup(out.info.dims[1], local[1]);
    cl::NDRange global(local[0] * blocksPerMatX * in.info.dims[2],
                       local[1] * blocksPerMatY * in.info.dims[3], 1);

    // Per-axis scale factors (input extent / output extent), computed in
    // double then narrowed to float for the kernel.
    double xd = (double)in.info.dims[0] / (double)out.info.dims[0];
    double yd = (double)in.info.dims[1] / (double)out.info.dims[1];
    float xf = (float)xd, yf = (float)yd;

    resizeOp(cl::EnqueueArgs(getQueue(), global, local),
             *out.data, out.info, *in.data, in.info,
             blocksPerMatX, blocksPerMatY, xf, yf);
    CL_DEBUG_FINISH(getQueue());
}
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca, cl_mem dB, size_t dB_offset, int lddb, int incb, cl_command_queue queue) { std::string refName = std::string("swapdblk_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {swapdblk_cl}; const int ker_lens[] = {swapdblk_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "swapdblk"); addKernelToCache(device, refName, entry); } int nblocks = n / nb; if (nblocks == 0) return; int info = 0; if (n < 0) { info = -1; } else if (nb < 1 || nb > 1024) { info = -2; } else if (ldda < (nblocks - 1) * nb * inca + nb) { info = -4; } else if (inca < 0) { info = -5; } else if (lddb < (nblocks - 1) * nb * incb + nb) { info = -7; } else if (incb < 0) { info = -8; } if (info != 0) { AF_ERROR("Invalid configuration", AF_ERR_INTERNAL); return; } NDRange local(nb); NDRange global(nblocks * nb); cl::Buffer dAObj(dA, true); cl::Buffer dBObj(dB, true); auto swapdOp = KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer, unsigned long long, int, int>(*entry.ker); cl::CommandQueue q(queue); swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca, dBObj, dB_offset, lddb, incb); }