static void get_out_idx(Buffer *out_data, Param &otmp, Param &rtmp, Param &in, uint threads_x, uint groups_x, uint groups_y) { std::string refName = std::string("get_out_idx_kernel_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog==0 && entry.ker==0) { ToNumStr<T> toNumStr; std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D zero=" << toNumStr(scalar<T>(0)) << " -D CPLX=" << af::iscplx<T>(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {where_cl}; const int ker_lens[] = {where_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "get_out_idx_kernel"); addKernelToCache(device, refName, entry); } NDRange local(threads_x, THREADS_PER_GROUP / threads_x); NDRange global(local[0] * groups_x * in.info.dims[2], local[1] * groups_y * in.info.dims[3]); uint lim = divup(otmp.info.dims[0], (threads_x * groups_x)); auto whereOp = KernelFunctor< Buffer, Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); whereOp(EnqueueArgs(getQueue(), global, local), *out_data, *otmp.data, otmp.info, *rtmp.data, rtmp.info, *in.data, in.info, groups_x, groups_y, lim); CL_DEBUG_FINISH(getQueue()); }
void laset(int m, int n, T offdiag, T diag, cl_mem dA, size_t dA_offset, magma_int_t ldda) { std::string refName = laset_name<uplo>() + std::string("_") + std::string(dtype_traits<T>::getName()) + std::to_string(uplo); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog==0 && entry.ker==0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D BLK_X=" << BLK_X << " -D BLK_Y=" << BLK_Y << " -D IS_CPLX=" << af::iscplx<T>(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {laset_cl}; const int ker_lens[] = {laset_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, laset_name<uplo>()); addKernelToCache(device, refName, entry); } int groups_x = (m - 1) / BLK_X + 1; int groups_y = (n - 1) / BLK_Y + 1; NDRange local(BLK_X, 1); NDRange global(groups_x * local[0], groups_y * local[1]); // retain the cl_mem object during cl::Buffer creation cl::Buffer dAObj(dA, true); auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker); lasetOp(EnqueueArgs(getQueue(), global, local), m, n, offdiag, diag, dAObj, dA_offset, ldda); }
// Enqueues "iota_kernel" (compiled from iota_cl) writing into `out`.
//
// Compilation is guarded by one std::once_flag per device index, so the
// program is built at most once per device for this function instance.
// Any cl::Error raised inside is passed to CL_TO_AF_ERROR and rethrown.
//
// NOTE(review): T and rep are declared outside this view (presumably
// template parameters) - confirm.
void iota(Param out)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> iotaProgs;
        static std::map<int, Kernel*>  iotaKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            std::ostringstream buildOpts;
            buildOpts << " -D T=" << dtype_traits<T>::getName();
            buildOpts << " -D rep=" << rep;
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                buildOpts << " -D USE_DOUBLE";
            }

            Program program;
            buildProgram(program, iota_cl, iota_cl_len, buildOpts.str());

            iotaProgs[device]   = new Program(program);
            iotaKernels[device] = new Kernel(*iotaProgs[device], "iota_kernel");
        });

        auto launch = make_kernel<Buffer, const KParam,
                                  const dim_type, const dim_type>(*iotaKernels[device]);

        NDRange local(TX, TY, 1);

        // Number of TILEX x TILEY tiles needed to cover the first two dims.
        dim_type tilesX = divup(out.info.dims[0], TILEX);
        dim_type tilesY = divup(out.info.dims[1], TILEY);

        // Dims 2 and 3 are folded into the x and y extents of the launch.
        NDRange global(local[0] * tilesX * out.info.dims[2],
                       local[1] * tilesY * out.info.dims[3],
                       1);

        launch(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, tilesX, tilesY);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
// Enqueues "triangle_kernel" (compiled from triangle_cl) reading `in` and
// writing `out`.
//
// The cache key and the build options both encode the element type name plus
// the is_upper and is_unit_diag flags, so each combination gets its own
// compiled kernel.
//
// NOTE(review): T, is_upper and is_unit_diag are declared outside this view
// (presumably template parameters) - confirm.
void triangle(Param out, const Param in)
{
    std::string cacheKey = std::string("triangle_kernel_")
                         + std::string(dtype_traits<T>::getName())
                         + std::to_string(is_upper)
                         + std::to_string(is_unit_diag);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Nothing cached yet for this key: compile and register it.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D is_upper=" << is_upper;
        buildOpts << " -D is_unit_diag=" << is_unit_diag;
        buildOpts << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        buildOpts << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {triangle_cl};
        const int   lengths[] = {triangle_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "triangle_kernel");
        addKernelToCache(device, cacheKey, entry);
    }

    NDRange local(TX, TY);

    // Tiles needed to cover the first two dimensions of the output.
    int groups_x = divup(out.info.dims[0], TILEX);
    int groups_y = divup(out.info.dims[1], TILEY);

    // Dims 2 and 3 are folded into the x and y extents of the launch.
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto launch = KernelFunctor<Buffer, KParam,
                                const Buffer, KParam,
                                const int, const int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           *in.data, in.info,
           groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues "iota_kernel" (compiled from iota_cl) writing into `out`.
//
// The four components of sdims and of tdims are forwarded to the kernel as
// individual int arguments, followed by the tile counts along dims 0 and 1.
// The compiled kernel is cached per device under a key that includes the
// element type name.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims)
{
    std::string cacheKey = std::string("iota_kernel_");
    cacheKey += std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernel the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {iota_cl};
        const int   lengths[] = {iota_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "iota_kernel");
        addKernelToCache(device, cacheKey, entry);
    }

    auto launch = KernelFunctor<Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int, const int, const int,
                                const int, const int>(*entry.ker);

    NDRange local(IOTA_TX, IOTA_TY, 1);

    // Number of TILEX x TILEY tiles needed to cover the first two dims.
    int tilesX = divup(out.info.dims[0], TILEX);
    int tilesY = divup(out.info.dims[1], TILEY);

    // Dims 2 and 3 are folded into the x and y extents of the launch.
    NDRange global(local[0] * tilesX * out.info.dims[2],
                   local[1] * tilesY * out.info.dims[3],
                   1);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           sdims[0], sdims[1], sdims[2], sdims[3],
           tdims[0], tdims[1], tdims[2], tdims[3],
           tilesX, tilesY);

    CL_DEBUG_FINISH(getQueue());
}
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> convProgs; static std::map<int, Kernel*> convKernels; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D accType="<< dtype_traits<aT>::getName() << " -D BASE_DIM="<< bDim << " -D EXPAND=" << expand; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } Program prog; buildProgram(prog, convolve_cl, convolve_cl_len, options.str()); convProgs[device] = new Program(prog); convKernels[device] = new Kernel(*convProgs[device], "convolve"); }); auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam, cl::LocalSpaceArg, Buffer, KParam, int, int, int, int, int, int, int, int >(*convKernels[device]); convOp(EnqueueArgs(getQueue(), param.global, param.local), *out.data, out.info, *signal.data, signal.info, cl::Local(param.loc_size), *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[0], param.o[1], param.o[2], param.s[0], param.s[1], param.s[2]); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Enqueues the "susan_responses" kernel (compiled from susan_cl with the
// RESPONSE define).
//
//   out, in       output and input buffers forwarded to the kernel
//   in_off        forwarded unchanged as the kernel's third argument
//   idim0, idim1  input extents; the launch covers them minus 2*edge each
//   t, g          float parameters forwarded unchanged
//   edge          border width excluded from the launch range
//
// NOTE(review): T and radius are declared outside this view (presumably
// template parameters) - confirm.
void susan(cl::Buffer* out, const cl::Buffer* in, const unsigned in_off, const unsigned idim0, const unsigned idim1, const float t, const float g, const unsigned edge)
{
    std::string cacheKey = std::string("susan_responses_")
                         + std::string(dtype_traits<T>::getName())
                         + std::to_string(radius);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernel the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        // Work-group tile extended by `radius` on every side.
        const size_t localMemElems = (SUSAN_THREADS_X + 2 * radius) * (SUSAN_THREADS_Y + 2 * radius);

        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D LOCAL_MEM_SIZE=" << localMemElems;
        buildOpts << " -D BLOCK_X=" << SUSAN_THREADS_X;
        buildOpts << " -D BLOCK_Y=" << SUSAN_THREADS_Y;
        buildOpts << " -D RADIUS=" << radius;
        buildOpts << " -D RESPONSE";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {susan_cl};
        const int   lengths[] = {susan_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "susan_responses");
        addKernelToCache(device, cacheKey, entry);
    }

    auto launch = KernelFunctor<Buffer, Buffer,
                                unsigned, unsigned, unsigned,
                                float, float, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);

    // Interior extents rounded up to a whole number of work-groups.
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    launch(EnqueueArgs(getQueue(), global, local),
           *out, *in, in_off, idim0, idim1, t, g, edge);
}
// Enqueues the "convert" kernel (compiled from hsv_rgb_cl) reading `in` and
// writing `out`.
//
// When isHSV2RGB is set the kernel is compiled with the isHSV2RGB define;
// the flag is also part of the cache key, so both variants are cached
// separately.
//
// NOTE(review): T and isHSV2RGB are declared outside this view (presumably
// template parameters) - confirm.
void hsv2rgb_convert(Param out, const Param in)
{
    std::string cacheKey = std::string("hsvrgb_convert_")
                         + std::string(dtype_traits<T>::getName())
                         + std::to_string(isHSV2RGB);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernel the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        if (isHSV2RGB) {
            buildOpts << " -D isHSV2RGB";
        }
        if (std::is_same<T, double>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {hsv_rgb_cl};
        const int   lengths[] = {hsv_rgb_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "convert");
        addKernelToCache(device, cacheKey, entry);
    }

    NDRange local(THREADS_X, THREADS_Y);

    int blk_x = divup(in.info.dims[0], THREADS_X);
    int blk_y = divup(in.info.dims[1], THREADS_Y);

    // Every image has three channels, so the batch index lives in the 4th
    // dimension; it is folded into the x extent of the launch.
    NDRange global(blk_x * in.info.dims[3] * THREADS_X,
                   blk_y * THREADS_Y);

    auto launch = KernelFunctor<Buffer, KParam, Buffer, KParam, int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           *in.data, in.info,
           blk_x);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues "identity_kernel" (compiled from identity_cl) writing into `out`.
//
// ONE and ZERO are passed to the kernel as compile-time defines produced by
// scalar_to_option(); the compiled kernel is cached per device under a key
// that includes the element type name.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
static void identity(Param out)
{
    std::string cacheKey = std::string("identity_kernel");
    cacheKey += std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernel the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        buildOpts << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {identity_cl};
        const int   lengths[] = {identity_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "identity_kernel");
        addKernelToCache(device, cacheKey, entry);
    }

    NDRange local(32, 8);

    // Work-groups needed to cover the first two dimensions.
    int groups_x = divup(out.info.dims[0], local[0]);
    int groups_y = divup(out.info.dims[1], local[1]);

    // Dims 2 and 3 are folded into the x and y extents of the launch.
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto launch = KernelFunctor<Buffer, const KParam, int, int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *(out.data), out.info, groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues the kernel returned by get_scan_dim_kernels<...>(1), passing `out`
// and `tmp` together with the group counts.
//
//   out, tmp    params whose data and info members are forwarded as-is
//   groups_all  group counts per dimension; [0]*[2] sizes the x extent and
//               [1]*[3] sizes the y extent of the launch
//
// NOTE(review): Ti, To, op, dim, isFinalPass and threads_y are declared
// outside this view (presumably template parameters) - confirm.
static void bcast_dim_launcher(Param &out, Param &tmp, const uint groups_all[4])
{
    Kernel kernel = get_scan_dim_kernels<Ti, To, op, dim, isFinalPass, threads_y>(1);

    NDRange local(THREADS_X, threads_y);
    NDRange global(groups_all[0] * groups_all[2] * local[0],
                   groups_all[1] * groups_all[3] * local[1]);

    // Per-work-item iteration limit along `dim`, passed as the last argument.
    uint lim = divup(out.info.dims[dim], (threads_y * groups_all[dim]));

    auto launch = make_kernel<Buffer, KParam,
                              Buffer, KParam,
                              uint, uint, uint, uint>(kernel);

    launch(EnqueueArgs(getQueue(), global, local),
           out.data, out.info,
           tmp.data, tmp.info,
           groups_all[0], groups_all[1], groups_all[dim], lim);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues the "set" kernel (compiled from set_cl) over `elements` work-items,
// passing the buffer, the value and the element count.
//
// The program and kernel are kept in per-device static arrays and are built
// once per device index via std::call_once.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
void set(Buffer &ptr, T val, const size_t &elements)
{
    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static Program setProgs[DeviceManager::MAX_DEVICES];
    static Kernel  setKernels[DeviceManager::MAX_DEVICES];

    int device = getActiveDeviceId();

    std::call_once(compileFlags[device], [device] () {
        Program::Sources sources;
        sources.emplace_back(set_cl, set_cl_len);

        setProgs[device] = Program(getContext(), sources);

        // Only the element type is passed as a build option.
        string buildOpts = string("-D T=") + dtype_traits<T>::getName();
        setProgs[device].build(buildOpts.c_str());

        setKernels[device] = Kernel(setProgs[device], "set");
    });

    auto launch = make_kernel<Buffer, T, const unsigned long>(setKernels[device]);

    // One-dimensional launch with only a global size given.
    launch(EnqueueArgs(getQueue(), NDRange(elements)), ptr, val, elements);
}
void copy(Param dst, const Param src, int ndims, outType default_value, double factor) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> cpyProgs; static std::map<int, Kernel*> cpyKernels; int device = getActiveDeviceId(); std::call_once(compileFlags[device], [&]() { std::ostringstream options; options << " -D inType=" << dtype_traits<inType>::getName() << " -D outType=" << dtype_traits<outType>::getName() << " -D inType_" << dtype_traits<inType>::getName() << " -D outType_" << dtype_traits<outType>::getName() << " -D SAME_DIMS=" << same_dims; if (std::is_same<inType, double>::value || std::is_same<inType, cdouble>::value || std::is_same<outType, double>::value || std::is_same<outType, cdouble>::value) { options << " -D USE_DOUBLE"; } Program prog; buildProgram(prog, copy_cl, copy_cl_len, options.str()); cpyProgs[device] = new Program(prog); cpyKernels[device] = new Kernel(*cpyProgs[device], "copy"); }); NDRange local(DIM0, DIM1); size_t local_size[] = {DIM0, DIM1}; local_size[0] *= local_size[1]; if (ndims == 1) { local_size[1] = 1; } int blk_x = divup(dst.info.dims[0], local_size[0]); int blk_y = divup(dst.info.dims[1], local_size[1]); NDRange global(blk_x * dst.info.dims[2] * DIM0, blk_y * dst.info.dims[3] * DIM1); dims_t trgt_dims; if (same_dims) { trgt_dims= {{dst.info.dims[0], dst.info.dims[1], dst.info.dims[2], dst.info.dims[3]}}; } else { dim_t trgt_l = std::min(dst.info.dims[3], src.info.dims[3]); dim_t trgt_k = std::min(dst.info.dims[2], src.info.dims[2]); dim_t trgt_j = std::min(dst.info.dims[1], src.info.dims[1]); dim_t trgt_i = std::min(dst.info.dims[0], src.info.dims[0]); trgt_dims= {{trgt_i, trgt_j, trgt_k, trgt_l}}; } auto copyOp = KernelFunctor<Buffer, KParam, Buffer, KParam, outType, float, dims_t, int, int >(*cpyKernels[device]); copyOp(EnqueueArgs(getQueue(), global, local), *dst.data, dst.info, *src.data, src.info, default_value, (float)factor, trgt_dims, blk_x, blk_y); 
CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Enqueues the "csrmm_nt" kernel (compiled from csrmm_cl).
//
//   out                     output param (only its buffer is forwarded)
//   values, rowIdx, colIdx  params whose buffers are forwarded to the kernel
//   rhs                     param forwarded with both buffer and info
//   alpha, beta             scalars forwarded to the kernel; whether they
//                           differ from 1 and 0 respectively is baked into
//                           the build via USE_ALPHA / USE_BETA
//
// A temporary zero-filled int buffer with one entry per x work-group is
// allocated, passed as the last kernel argument and released afterwards.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
void csrmm_nt(Param out, const Param &values, const Param &rowIdx, const Param &colIdx, const Param &rhs, const T alpha, const T beta)
{
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Greedy indexing causes performance issues on many platforms, so it is
    // switched off here.
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string cacheKey = std::string("csrmm_nt_")
                         + std::string(dtype_traits<T>::getName())
                         + std::string("_") + std::to_string(use_alpha)
                         + std::string("_") + std::to_string(use_beta)
                         + std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernels the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D USE_ALPHA=" << use_alpha;
        buildOpts << " -D USE_BETA=" << use_beta;
        buildOpts << " -D USE_GREEDY=" << use_greedy;
        buildOpts << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const bool isComplex = std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value;
        if (isComplex) {
            buildOpts << " -D IS_CPLX=1";
        } else {
            buildOpts << " -D IS_CPLX=0";
        }

        const char *sources[] = {csrmm_cl};
        const int   lengths[] = {csrmm_cl_len};

        Program program;
        buildProgram(program, 1, sources, lengths, buildOpts.str());

        entry.prog   = new Program(program);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");
        addKernelToCache(device, cacheKey, entry);
    }

    auto kernel = entry.ker[0];
    auto launch = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                int, int,
                                Buffer, KParam,
                                T, T, Buffer>(kernel);

    NDRange local(THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);

    NDRange global(local[0] * groups_x,
                   local[1] * groups_y);

    // Value-initialised (all zero) host data uploaded with a blocking write.
    std::vector<int> zeros(groups_x);
    cl::Buffer *counter = bufferAlloc(zeros.size() * sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0,
                                  zeros.size() * sizeof(int),
                                  (void *)zeros.data());

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, *values.data, *rowIdx.data, *colIdx.data,
           M, N,
           *rhs.data, rhs.info,
           alpha, beta, *counter);

    bufferFree(counter);
}
// Enqueues "approx1_kernel" (compiled from approx1_cl) reading `in` and `pos`
// and writing `out`.
//
//   out      output param
//   in       input param
//   pos      param holding the positions forwarded to the kernel
//   offGrid  float forwarded unchanged to the kernel
//
// The kernel is compiled once per device index. The interpolation define
// (NEAREST or LINEAR) is selected from `method`; other values add no INTERP
// define. Any cl::Error raised inside is passed to CL_TO_AF_ERROR and
// rethrown.
//
// NOTE(review): Ty, Tp and method are declared outside this view (presumably
// template parameters) - confirm.
void approx1(Param out, const Param in, const Param pos, const float offGrid)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> approxProgs;
        static std::map<int, Kernel*>  approxKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            ToNum<Ty> toNum;

            std::ostringstream options;
            options << " -D Ty=" << dtype_traits<Ty>::getName()
                    << " -D Tp=" << dtype_traits<Tp>::getName()
                    << " -D ZERO=" << toNum(scalar<Ty>(0));

            if ((af_dtype) dtype_traits<Ty>::af_type == c32 ||
                (af_dtype) dtype_traits<Ty>::af_type == c64) {
                options << " -D CPLX=1";
            } else {
                options << " -D CPLX=0";
            }

            if (std::is_same<Ty, double>::value || std::is_same<Ty, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            switch (method) {
                case AF_INTERP_NEAREST:
                    options << " -D INTERP=NEAREST";
                    break;
                case AF_INTERP_LINEAR:
                    options << " -D INTERP=LINEAR";
                    break;
                default:
                    break;
            }

            Program prog;
            buildProgram(prog, approx1_cl, approx1_cl_len, options.str());
            approxProgs[device]   = new Program(prog);
            approxKernels[device] = new Kernel(*approxProgs[device], "approx1_kernel");
        });

        auto approx1Op = make_kernel<Buffer, const KParam,
                                     const Buffer, const KParam,
                                     const Buffer, const KParam,
                                     const float, const int>(*approxKernels[device]);

        NDRange local(THREADS, 1, 1);

        // Work-groups needed to cover dimension 0 of the output.
        int blocksPerMat = divup(out.info.dims[0], local[0]);

        // Each axis of the global range is scaled by the work-group extent of
        // that same axis. The y axis previously used local[0] (THREADS), which
        // requested THREADS times more work-groups than dims[2] * dims[3].
        NDRange global(blocksPerMat * local[0] * out.info.dims[1],
                       out.info.dims[2] * out.info.dims[3] * local[1],
                       1);

        approx1Op(EnqueueArgs(getQueue(), global, local),
                  *out.data, out.info,
                  *in.data, in.info,
                  *pos.data, pos.info,
                  offGrid, blocksPerMat);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void fast(unsigned* out_feat, Param &x_out, Param &y_out, Param &score_out, Param in, const float thr, const float feature_ratio, const unsigned edge) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fastProgs; static std::map<int, Kernel*> lfKernel; static std::map<int, Kernel*> nmKernel; static std::map<int, Kernel*> gfKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D ARC_LENGTH=" << arc_length << " -D NONMAX=" << static_cast<unsigned>(nonmax); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fast_cl, fast_cl_len, options.str()); fastProgs[device] = new Program(prog); lfKernel[device] = new Kernel(*fastProgs[device], "locate_features"); nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts"); gfKernel[device] = new Kernel(*fastProgs[device], "get_features"); }); const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio); // Matrix containing scores for detected features, scores are stored in the // same coordinates as features, dimensions should be equal to in. 
cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float)); std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0); getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]); cl::Buffer *d_flags = d_score; if (nonmax) { d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T)); } const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X); const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y); // Locate features kernel sizes const NDRange local(FAST_THREADS_X, FAST_THREADS_Y); const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y); auto lfOp = make_kernel<Buffer, KParam, Buffer, const float, const unsigned, LocalSpaceArg> (*lfKernel[device]); lfOp(EnqueueArgs(getQueue(), global, local), *in.data, in.info, *d_score, thr, edge, cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T))); CL_DEBUG_FINISH(getQueue()); const int blk_nonmax_x = divup(in.info.dims[0], 64); const int blk_nonmax_y = divup(in.info.dims[1], 64); // Nonmax kernel sizes const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y); const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y); unsigned count_init = 0; cl::Buffer *d_total = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init); //size_t *global_nonmax_dims = global_nonmax(); size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned); cl::Buffer *d_counts = bufferAlloc(blocks_sz); cl::Buffer *d_offsets = bufferAlloc(blocks_sz); auto nmOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned> (*nmKernel[device]); nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge); CL_DEBUG_FINISH(getQueue()); unsigned total; 
getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total); total = total < max_feat ? total : max_feat; if (total > 0) { size_t out_sz = total * sizeof(float); x_out.data = bufferAlloc(out_sz); y_out.data = bufferAlloc(out_sz); score_out.data = bufferAlloc(out_sz); auto gfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned, const unsigned> (*gfKernel[device]); gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *x_out.data, *y_out.data, *score_out.data, *d_flags, *d_counts, *d_offsets, in.info, total, edge); CL_DEBUG_FINISH(getQueue()); } *out_feat = total; x_out.info.dims[0] = total; x_out.info.strides[0] = 1; y_out.info.dims[0] = total; y_out.info.strides[0] = 1; score_out.info.dims[0] = total; score_out.info.strides[0] = 1; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = 1; x_out.info.strides[k] = total; y_out.info.dims[k] = 1; y_out.info.strides[k] = total; score_out.info.dims[k] = 1; score_out.info.strides[k] = total; } bufferFree(d_score); if (nonmax) bufferFree(d_flags); bufferFree(d_total); bufferFree(d_counts); bufferFree(d_offsets); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
void reorderOutputHelper(Param out, Param packed, Param sig, Param filter, const int baseDim, ConvolveBatchKind kind) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fftconvolveProgs; static std::map<int, Kernel*> roKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D ROUND_OUT=" << (int)roundOut << " -D EXPAND=" << (int)expand; if ((af_dtype) dtype_traits<convT>::af_type == c32) { options << " -D CONVT=float"; } else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) { options << " -D CONVT=double" << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fftconvolve_reorder_cl, fftconvolve_reorder_cl_len, options.str()); fftconvolveProgs[device] = new Program(prog); roKernel[device] = new Kernel(*fftconvolveProgs[device], "reorder_output"); }); Param sig_tmp, filter_tmp; calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind); // Number of packed complex elements in dimension 0 int sig_half_d0 = divup(sig.info.dims[0], 2); int blocks = divup(out.info.strides[3] * out.info.dims[3], THREADS); NDRange local(THREADS); NDRange global(blocks * THREADS); auto roOp = make_kernel<Buffer, KParam, Buffer, KParam, KParam, const int, const int> (*roKernel[device]); if (kind == ONE2MANY) { roOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *filter_tmp.data, filter_tmp.info, filter.info, sig_half_d0, baseDim); } else { roOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *sig_tmp.data, sig_tmp.info, filter.info, sig_half_d0, baseDim); } CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Enqueues "unwrap_kernel" (compiled from unwrap_cl) reading `in` and writing
// `out`.
//
//   wx, wy     window extents; their product determines `reps`
//   sx, sy     forwarded unchanged to the kernel
//   px, py     forwarded unchanged to the kernel
//   nx         forwarded unchanged to the kernel
//   is_column  selects the launch geometry and is baked into the build
//
// The compiled kernel is cached per device under a key that includes the
// element type name and is_column.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx, const bool is_column)
{
    std::string cacheKey = std::string("unwrap_")
                         + std::string(dtype_traits<T>::getName())
                         + std::string("_")
                         + std::to_string(is_column);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // Compile and register the kernel the first time this key is requested.
    if (entry.prog == nullptr && entry.ker == nullptr) {
        ToNumStr<T> toNumStr;

        std::ostringstream buildOpts;
        buildOpts << " -D is_column=" << is_column;
        buildOpts << " -D ZERO=" << toNumStr(scalar<T>(0));
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        Program program;
        buildProgram(program, unwrap_cl, unwrap_cl_len, buildOpts.str());

        entry.prog = new Program(program);
        entry.ker  = new Kernel(*entry.prog, "unwrap_kernel");
        addKernelToCache(device, cacheKey, entry);
    }

    dim_t thr_x = 1;
    dim_t thr_y = 1;
    dim_t grp_x = 1;
    dim_t reps  = 1;

    // Dims 2 and 3 of the output are covered by the y group count.
    const dim_t grp_y = out.info.dims[2] * out.info.dims[3];

    if (is_column) {
        // Work-group x extent follows out.dims[0] (rounded up to a power of
        // two, capped at THREADS_PER_GROUP); y takes the remaining threads.
        thr_x = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
        thr_y = THREADS_PER_GROUP / thr_x;
        grp_x = divup(out.info.dims[1], thr_y);
        reps  = divup((wx * wy), thr_x);
    } else {
        thr_x = THREADS_X;
        thr_y = THREADS_Y;
        grp_x = divup(out.info.dims[0], thr_x);
        reps  = divup((wx * wy), thr_y);
    }

    NDRange local(thr_x, thr_y);
    NDRange global(local[0] * grp_x, local[1] * grp_y);

    auto launch = KernelFunctor<Buffer, const KParam,
                                const Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int, const int, const int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           *in.data, in.info,
           wx, wy, sx, sy, px, py, nx, reps);

    CL_DEBUG_FINISH(getQueue());
}
// Example kernel wrapper: compiles the "example" kernel from example_cl (once
// per device index) and enqueues it with `a` and `b` as inputs and `c` as
// output.
//
//   c     output param; its first two extents size the launch
//   a, b  input params
//   p     enum value forwarded to the kernel as an int
//
// Any cl::Error raised inside is passed to CL_TO_AF_ERROR and rethrown, as in
// the other kernel wrappers of this file.
//
// NOTE(review): T is declared outside this view (presumably a template
// parameter) - confirm.
void exampleFunc(Param c, const Param a, const Param b, const af_someenum_t p)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> egProgs;
        static std::map<int, Kernel*>  egKernels;

        int device = getActiveDeviceId();

        // std::call_once is used to ensure OpenCL kernels
        // are compiled only once for any given device and combination
        // of template parameters to this kernel wrapper function 'exampleFunc<T>'
        std::call_once(compileFlags[device], [device] () {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName();

            // Any template parameter can be passed as a compile option to
            // the kernel compilation step. This is the equivalent of having
            // templated kernels in CUDA.

            // The following option is passed to kernel compilation
            // if template parameter T is double or complex double
            // to enable the FP64 extension
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;

            // The helper function 'buildProgram' uses the option string
            // we just created and compiles the kernel string 'example_cl',
            // which was created by our OpenCL kernel code obfuscation stage
            buildProgram(prog, example_cl, example_cl_len, options.str());

            // create a cl::Program object on heap
            egProgs[device] = new Program(prog);

            // create a cl::Kernel object on heap
            egKernels[device] = new Kernel(*egProgs[device], "example");
        });

        // configure work group parameters
        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(c.info.dims[0], THREADS_X);
        int blk_y = divup(c.info.dims[1], THREADS_Y);

        // configure global launch parameters
        NDRange global(blk_x * THREADS_X, blk_y * THREADS_Y);

        // create a kernel functor from the cl::Kernel object
        // corresponding to the device on which current execution
        // is happening.
        auto exampleFuncOp = KernelFunctor<Buffer, KParam,
                                           Buffer, KParam,
                                           Buffer, KParam,
                                           int>(*egKernels[device]);

        // launch the kernel
        exampleFuncOp(EnqueueArgs(getQueue(), global, local),
                      *c.data, c.info,
                      *a.data, a.info,
                      *b.data, b.info,
                      (int)p);

        // Below Macro activates validations ONLY in DEBUG
        // mode as its name indicates
        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        // Catch all cl::Errors and convert them
        // to appropriate ArrayFire error codes
        CL_TO_AF_ERROR(err);
        // Rethrow so the failure can never be swallowed here; this matches
        // the catch blocks of the other kernel wrappers.
        throw;
    }
}
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca, cl_mem dB, size_t dB_offset, int lddb, int incb) { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> swpProgs; static std::map<int, Kernel*> swpKernels; int device = getActiveDeviceId(); std::call_once(compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, swapdblk_cl, swapdblk_cl_len, options.str()); swpProgs[device] = new Program(prog); swpKernels[device] = new Kernel(*swpProgs[device], "swapdblk"); }); int nblocks = n / nb; if(nblocks == 0) return; int info = 0; if (n < 0) { info = -1; } else if (nb < 1 || nb > 1024) { info = -2; } else if (ldda < (nblocks-1)*nb*inca + nb) { info = -4; } else if (inca < 0) { info = -5; } else if (lddb < (nblocks-1)*nb*incb + nb) { info = -7; } else if (incb < 0) { info = -8; } if (info != 0) { AF_ERROR("Invalid configuration", AF_ERR_INTERNAL); return; } NDRange local(nb); NDRange global(nblocks * nb); auto swapdOp = make_kernel<int, cl_mem, unsigned long long, int, int, cl_mem, unsigned long long, int, int>(*swpKernels[device]); swapdOp(EnqueueArgs(getQueue(), global, local), nb, dA, dA_offset, ldda, inca, dB, dB_offset, lddb, incb); }
void mean_first_launcher(Param out, Param owt, Param in, Param inWeight, const int threads_x, const uint groups_x, const uint groups_y) { bool input_weight = ((inWeight.info.dims[0] * inWeight.info.dims[1] * inWeight.info.dims[2] * inWeight.info.dims[3]) != 0); bool output_weight = (( owt.info.dims[0] * owt.info.dims[1] * owt.info.dims[2] * owt.info.dims[3]) != 0); std::string ref_name = std::string("mean_0_") + std::string(dtype_traits<Ti>::getName()) + std::string("_") + std::string(dtype_traits<Tw>::getName()) + std::string("_") + std::string(dtype_traits<To>::getName()) + std::string("_") + std::to_string(threads_x) + std::string("_") + std::to_string(input_weight) + std::string("_") + std::to_string(output_weight); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, ref_name); if (entry.prog==0 && entry.ker==0) { Binary<To, af_add_t> mean; ToNumStr<To> toNumStr; ToNumStr<Tw> twNumStr; Transform<uint, Tw, af_add_t> transform_weight; std::ostringstream options; options << " -D Ti=" << dtype_traits<Ti>::getName() << " -D Tw=" << dtype_traits<Tw>::getName() << " -D To=" << dtype_traits<To>::getName() << " -D DIMX=" << threads_x << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP << " -D init_To=" << toNumStr(mean.init()) << " -D init_Tw=" << twNumStr(transform_weight(0)) << " -D one_Tw=" << twNumStr(transform_weight(1)); if (input_weight) { options << " -D INPUT_WEIGHT"; } if (output_weight) { options << " -D OUTPUT_WEIGHT"; } if (std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value || std::is_same<To, double>::value) { options << " -D USE_DOUBLE"; } const char *ker_strs[] = {mean_ops_cl, mean_first_cl}; const int ker_lens[] = {mean_ops_cl_len, mean_first_cl_len}; Program prog; buildProgram(prog, 2, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "mean_first_kernel"); addKernelToCache(device, ref_name, entry); } NDRange local(threads_x, THREADS_PER_GROUP / threads_x); NDRange 
global(groups_x * in.info.dims[2] * local[0], groups_y * in.info.dims[3] * local[1]); uint repeat = divup(in.info.dims[0], (local[0] * groups_x)); if (input_weight && output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *owt.data, owt.info, *in.data, in.info, *inWeight.data, inWeight.info, groups_x, groups_y, repeat); } else if (!input_weight && !output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, groups_x, groups_y, repeat); } else if ( input_weight && !output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, *inWeight.data, inWeight.info, groups_x, groups_y, repeat); } else if (!input_weight && output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *owt.data, owt.info, *in.data, in.info, groups_x, groups_y, repeat); } CL_DEBUG_FINISH(getQueue()); }
// Enqueues a CSR matrix-vector kernel writing into `out`.
//
//   values, rowIdx, colIdx - CSR arrays; rowIdx.info.dims[0] - 1 is passed
//                            to the kernel as M
//   rhs                    - vector operand
//   alpha, beta            - scalars forwarded to the kernel; USE_ALPHA /
//                            USE_BETA are compiled in only when they differ
//                            from 1 and 0 respectively
//
// Two kernels are built per variant ("csrmv_thread", "csrmv_block"); the
// block variant is currently always the one launched.
void csrmv(Param out,
           const Param &values, const Param &rowIdx, const Param &colIdx,
           const Param &rhs, const T alpha, const T beta)
{
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    // FIXME: Find a better number based on average non zeros per row
    int threads = 64;

    // Cache key: type plus every compile-time switch
    std::string ref_name("csrmv_");
    ref_name += std::string(dtype_traits<T>::getName());
    ref_name += std::string("_");
    ref_name += std::to_string(use_alpha);
    ref_name += std::string("_");
    ref_name += std::to_string(use_beta);
    ref_name += std::string("_");
    ref_name += std::to_string(use_greedy);
    ref_name += std::string("_");
    ref_name += std::to_string(threads);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS=" << threads;

        if (std::is_same<T, double>::value ||
            std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const bool isComplex = std::is_same<T, cfloat>::value ||
                               std::is_same<T, cdouble>::value;
        if (isComplex) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmv_cl};
        const int   ker_lens[] = {csrmv_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        // Slot 0: per-thread kernel, slot 1: per-block kernel
        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
        entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

        addKernelToCache(device, ref_name, entry);
    }

    // Device-side int initialised to zero (blocking write) and handed to the
    // kernel as its last argument
    int count = 0;
    cl::Buffer *counter = bufferAlloc(sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, sizeof(int), (void *)&count);

    // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
    bool is_csrmv_block = true;
    auto csrmv_kernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];

    auto csrmv_func = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                    int,
                                    Buffer, KParam,
                                    T, T,
                                    Buffer>(csrmv_kernel);

    NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;

    int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
    groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);

    NDRange global(local[0] * groups_x, 1);

    csrmv_func(EnqueueArgs(getQueue(), global, local),
               *out.data,
               *values.data, *rowIdx.data, *colIdx.data,
               M,
               *rhs.data, rhs.info,
               alpha, beta,
               *counter);

    CL_DEBUG_FINISH(getQueue());

    bufferFree(counter);
}
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca, cl_mem dB, size_t dB_offset, int lddb, int incb, cl_command_queue queue) { std::string refName = std::string("swapdblk_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {swapdblk_cl}; const int ker_lens[] = {swapdblk_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "swapdblk"); addKernelToCache(device, refName, entry); } int nblocks = n / nb; if (nblocks == 0) return; int info = 0; if (n < 0) { info = -1; } else if (nb < 1 || nb > 1024) { info = -2; } else if (ldda < (nblocks - 1) * nb * inca + nb) { info = -4; } else if (inca < 0) { info = -5; } else if (lddb < (nblocks - 1) * nb * incb + nb) { info = -7; } else if (incb < 0) { info = -8; } if (info != 0) { AF_ERROR("Invalid configuration", AF_ERR_INTERNAL); return; } NDRange local(nb); NDRange global(nblocks * nb); cl::Buffer dAObj(dA, true); cl::Buffer dBObj(dB, true); auto swapdOp = KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer, unsigned long long, int, int>(*entry.ker); cl::CommandQueue q(queue); swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca, dBObj, dB_offset, lddb, incb); }
void complexMultiplyHelper(Param packed, Param sig, Param filter, const int baseDim, ConvolveBatchKind kind) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fftconvolveProgs; static std::map<int, Kernel*> cmKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D ONE2ONE=" << (int)ONE2ONE << " -D MANY2ONE=" << (int)MANY2ONE << " -D ONE2MANY=" << (int)ONE2MANY << " -D MANY2MANY=" << (int)MANY2MANY; if ((af_dtype) dtype_traits<convT>::af_type == c32) { options << " -D CONVT=float"; } else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) { options << " -D CONVT=double" << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fftconvolve_multiply_cl, fftconvolve_multiply_cl_len, options.str()); fftconvolveProgs[device] = new Program(prog); cmKernel[device] = new Kernel(*fftconvolveProgs[device], "complex_multiply"); }); Param sig_tmp, filter_tmp; calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind); int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3]; int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3]; int mul_elem = (sig_packed_elem < filter_packed_elem) ? filter_packed_elem : sig_packed_elem; int blocks = divup(mul_elem, THREADS); NDRange local(THREADS); NDRange global(blocks * THREADS); // Multiply filter and signal FFT arrays auto cmOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer, KParam, const int, const int> (*cmKernel[device]); cmOp(EnqueueArgs(getQueue(), global, local), *packed.data, packed.info, *sig_tmp.data, sig_tmp.info, *filter_tmp.data, filter_tmp.info, mul_elem, (int)kind); CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
void nearest_neighbour(Param idx, Param dist, Param query, Param train, const dim_t dist_dim, const unsigned n_dist) { try { const unsigned feat_len = query.info.dims[dist_dim]; const To max_dist = maxval<To>(); // Determine maximum feat_len capable of using shared memory (faster) cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T); size_t ltrain_sz = THREADS * feat_len * sizeof(T); bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false; size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef; unsigned unroll_len = nextpow2(feat_len); if (unroll_len != feat_len) unroll_len = 0; std::string ref_name = std::string("knn_") + std::to_string(dist_type) + std::string("_") + std::to_string(use_lmem) + std::string("_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::to_string(unroll_len); int device = getActiveDeviceId(); kc_t::iterator cache_idx = kernelCaches[device].find(ref_name); kc_entry_t entry; if (cache_idx == kernelCaches[device].end()) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D To=" << dtype_traits<To>::getName() << " -D THREADS=" << THREADS << " -D FEAT_LEN=" << unroll_len; switch(dist_type) { case AF_SAD: options <<" -D DISTOP=_sad_"; break; case AF_SSD: options <<" -D DISTOP=_ssd_"; break; case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__"; break; default: break; } if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } if (use_lmem) options << " -D USE_LOCAL_MEM"; cl::Program prog; buildProgram(prog, nearest_neighbour_cl, nearest_neighbour_cl_len, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel[3]; entry.ker[0] = Kernel(*entry.prog, "nearest_neighbour_unroll"); entry.ker[1] = Kernel(*entry.prog, "nearest_neighbour"); entry.ker[2] = Kernel(*entry.prog, "select_matches"); kernelCaches[device][ref_name] 
= entry; } else { entry = cache_idx->second; } const dim_t sample_dim = (dist_dim == 0) ? 1 : 0; const unsigned nquery = query.info.dims[sample_dim]; const unsigned ntrain = train.info.dims[sample_dim]; unsigned nblk = divup(ntrain, THREADS); const NDRange local(THREADS, 1); const NDRange global(nblk * THREADS, 1); cl::Buffer *d_blk_idx = bufferAlloc(nblk * nquery * sizeof(unsigned)); cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To)); // For each query vector, find training vector with smallest Hamming // distance per CUDA block if (unroll_len > 0) { auto huOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam, const To, LocalSpaceArg> (entry.ker[0]); huOp(EnqueueArgs(getQueue(), global, local), *d_blk_idx, *d_blk_dist, *query.data, query.info, *train.data, train.info, max_dist, cl::Local(lmem_sz)); } else { auto hmOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam, const To, const unsigned, LocalSpaceArg> (entry.ker[1]); hmOp(EnqueueArgs(getQueue(), global, local), *d_blk_idx, *d_blk_dist, *query.data, query.info, *train.data, train.info, max_dist, feat_len, cl::Local(lmem_sz)); } CL_DEBUG_FINISH(getQueue()); const NDRange local_sm(32, 8); const NDRange global_sm(divup(nquery, 32) * 32, 8); // Reduce all smallest Hamming distances from each block and store final // best match auto smOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer, const unsigned, const unsigned, const To> (entry.ker[2]); smOp(EnqueueArgs(getQueue(), global_sm, local_sm), *idx.data, *dist.data, *d_blk_idx, *d_blk_dist, nquery, nblk, max_dist); CL_DEBUG_FINISH(getQueue()); bufferFree(d_blk_idx); bufferFree(d_blk_dist); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
void packDataHelper(Param packed, Param sig, Param filter, const int baseDim, ConvolveBatchKind kind) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fftconvolveProgs; static std::map<int, Kernel*> pdKernel; static std::map<int, Kernel*> paKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if ((af_dtype) dtype_traits<convT>::af_type == c32) { options << " -D CONVT=float"; } else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) { options << " -D CONVT=double" << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fftconvolve_pack_cl, fftconvolve_pack_cl_len, options.str()); fftconvolveProgs[device] = new Program(prog); pdKernel[device] = new Kernel(*fftconvolveProgs[device], "pack_data"); paKernel[device] = new Kernel(*fftconvolveProgs[device], "pad_array"); }); Param sig_tmp, filter_tmp; calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind); int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3]; int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3]; // Number of packed complex elements in dimension 0 int sig_half_d0 = divup(sig.info.dims[0], 2); int sig_half_d0_odd = sig.info.dims[0] % 2; int blocks = divup(sig_packed_elem, THREADS); // Locate features kernel sizes NDRange local(THREADS); NDRange global(blocks * THREADS); // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s auto pdOp = make_kernel<Buffer, KParam, Buffer, KParam, const int, const int> (*pdKernel[device]); pdOp(EnqueueArgs(getQueue(), global, local), *sig_tmp.data, sig_tmp.info, *sig.data, sig.info, sig_half_d0, sig_half_d0_odd); CL_DEBUG_FINISH(getQueue()); blocks = divup(filter_packed_elem, THREADS); global = NDRange(blocks * THREADS); // Pad filter array 
with 0s auto paOp = make_kernel<Buffer, KParam, Buffer, KParam> (*paKernel[device]); paOp(EnqueueArgs(getQueue(), global, local), *filter_tmp.data, filter_tmp.info, *filter.data, filter.info); CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Enqueues the "morph" kernel (dilation or erosion, selected by the
// compile-time flag `isDilation`) with a windLen x windLen structuring
// element.
//
//   out  - result
//   in   - input; dims[0]/dims[1] size the work-group grid, dims[2]/dims[3]
//          multiply the global size along x/y
//   mask - structuring element; windLen*windLen elements of T are copied
//          into a scratch buffer before the launch
//
// The program is compiled once per device. cl::Error is converted through
// CL_TO_AF_ERROR.
void morph(Param out, const Param in, const Param mask)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> morProgs;
        static std::map<int, Kernel*>  morKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            ToNumStr<T> toNumStr;

            // Initial accumulator value: init() of the max operator for
            // dilation, of the min operator otherwise
            T init = isDilation ? Binary<T, af_max_t>().init()
                                : Binary<T, af_min_t>().init();

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D isDilation="<< isDilation
                    << " -D init=" << toNumStr(init)
                    << " -D windLen=" << windLen;

            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, morph_cl, morph_cl_len, options.str());

            morProgs[device]   = new Program(prog);
            morKernels[device] = new Kernel(*morProgs[device], "morph");
        });

        auto morphOp = KernelFunctor<Buffer, KParam,
                                     Buffer, KParam,
                                     Buffer, cl::LocalSpaceArg,
                                     int, int>(*morKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(in.info.dims[0], THREADS_X);
        int blk_y = divup(in.info.dims[1], THREADS_Y);

        // launch batch * blk_x blocks along x dimension
        NDRange global(blk_x * THREADS_X * in.info.dims[2],
                       blk_y * THREADS_Y * in.info.dims[3]);

        // Stage the structuring element in its own device buffer
        cl_int se_size = sizeof(T)*windLen*windLen;
        cl::Buffer *mBuff = bufferAlloc(se_size);
        getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

        // Local-memory tile: the work-group footprint plus a halo of
        // windLen/2 on each side, with one extra column per row
        const int halo    = windLen/2;
        const int padding = 2*halo;
        const int locLen  = THREADS_X + padding + 1;
        const int locSize = locLen * (THREADS_Y+padding);

        morphOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                *mBuff,
                cl::Local(locSize*sizeof(T)),
                blk_x, blk_y);

        bufferFree(mBuff);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out, Param& ori_out, Param& size_out, Param& desc_out, Param image, const float fast_thr, const unsigned max_feat, const float scl_fctr, const unsigned levels, const bool blur_img) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static Program orbProgs[DeviceManager::MAX_DEVICES]; static Kernel hrKernel[DeviceManager::MAX_DEVICES]; static Kernel kfKernel[DeviceManager::MAX_DEVICES]; static Kernel caKernel[DeviceManager::MAX_DEVICES]; static Kernel eoKernel[DeviceManager::MAX_DEVICES]; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D BLOCK_SIZE=" << ORB_THREADS_X; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } buildProgram(orbProgs[device], orb_cl, orb_cl_len, options.str()); hrKernel[device] = Kernel(orbProgs[device], "harris_response"); kfKernel[device] = Kernel(orbProgs[device], "keep_features"); caKernel[device] = Kernel(orbProgs[device], "centroid_angle"); eoKernel[device] = Kernel(orbProgs[device], "extract_orb"); }); unsigned patch_size = REF_PAT_SIZE; unsigned min_side = std::min(image.info.dims[0], image.info.dims[1]); unsigned max_levels = 0; float scl_sum = 0.f; for (unsigned i = 0; i < levels; i++) { min_side /= scl_fctr; // Minimum image side for a descriptor to be computed if (min_side < patch_size || max_levels == levels) break; max_levels++; scl_sum += 1.f / (float)pow(scl_fctr,(float)i); } std::vector<cl::Buffer*> d_x_pyr(max_levels); std::vector<cl::Buffer*> d_y_pyr(max_levels); std::vector<cl::Buffer*> d_score_pyr(max_levels); std::vector<cl::Buffer*> d_ori_pyr(max_levels); std::vector<cl::Buffer*> d_size_pyr(max_levels); std::vector<cl::Buffer*> d_desc_pyr(max_levels); std::vector<unsigned> feat_pyr(max_levels); unsigned total_feat = 0; // Compute number of features to keep for 
each level std::vector<unsigned> lvl_best(max_levels); unsigned feat_sum = 0; for (unsigned i = 0; i < max_levels-1; i++) { float lvl_scl = (float)pow(scl_fctr,(float)i); lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl); feat_sum += lvl_best[i]; } lvl_best[max_levels-1] = max_feat - feat_sum; // Maintain a reference to previous level image Param prev_img; Param lvl_img; const unsigned gauss_len = 9; T* h_gauss = nullptr; Param gauss_filter; gauss_filter.data = nullptr; for (unsigned i = 0; i < max_levels; i++) { const float lvl_scl = (float)pow(scl_fctr,(float)i); if (i == 0) { // First level is used in its original size lvl_img = image; prev_img = image; } else if (i > 0) { // Resize previous level image to current level dimensions lvl_img.info.dims[0] = round(image.info.dims[0] / lvl_scl); lvl_img.info.dims[1] = round(image.info.dims[1] / lvl_scl); lvl_img.info.strides[0] = 1; lvl_img.info.strides[1] = lvl_img.info.dims[0]; for (int k = 2; k < 4; k++) { lvl_img.info.dims[k] = 1; lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] * lvl_img.info.strides[k - 1]; } lvl_img.info.offset = 0; lvl_img.data = bufferAlloc(lvl_img.info.dims[3] * lvl_img.info.strides[3] * sizeof(T)); resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img); if (i > 1) bufferFree(prev_img.data); prev_img = lvl_img; } unsigned lvl_feat = 0; Param d_x_feat, d_y_feat, d_score_feat; // Round feature size to nearest odd integer float size = 2.f * floor(patch_size / 2.f) + 1.f; // Avoid keeping features that might be too wide and might not fit on // the image, sqrt(2.f) is the radius when angle is 45 degrees and // represents widest case possible unsigned edge = ceil(size * sqrt(2.f) / 2.f); // Detect FAST features fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat, lvl_img, fast_thr, 0.15f, edge); if (lvl_feat == 0) { feat_pyr[i] = 0; if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); continue; } bufferFree(d_score_feat.data); unsigned usable_feat = 0; cl::Buffer* d_usable_feat = 
bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat); cl::Buffer* d_x_harris = bufferAlloc(lvl_feat * sizeof(float)); cl::Buffer* d_y_harris = bufferAlloc(lvl_feat * sizeof(float)); cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float)); // Calculate Harris responses // Good block_size >= 7 (must be an odd number) const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X); const NDRange local(ORB_THREADS_X, ORB_THREADS_Y); const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y); unsigned block_size = 7; float k_thr = 0.04f; auto hrOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, const unsigned, Buffer, Buffer, KParam, const unsigned, const float, const unsigned> (hrKernel[device]); hrOp(EnqueueArgs(getQueue(), global, local), *d_x_harris, *d_y_harris, *d_score_harris, *d_x_feat.data, *d_y_feat.data, lvl_feat, *d_usable_feat, *lvl_img.data, lvl_img.info, block_size, k_thr, patch_size); CL_DEBUG_FINISH(getQueue()); getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat); bufferFree(d_x_feat.data); bufferFree(d_y_feat.data); bufferFree(d_usable_feat); if (usable_feat == 0) { feat_pyr[i] = 0; bufferFree(d_x_harris); bufferFree(d_y_harris); bufferFree(d_score_harris); if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); continue; } // Sort features according to Harris responses Param d_harris_sorted; Param d_harris_idx; d_harris_sorted.info.dims[0] = usable_feat; d_harris_idx.info.dims[0] = usable_feat; d_harris_sorted.info.strides[0] = 1; d_harris_idx.info.strides[0] = 1; for (int k = 1; k < 4; k++) { d_harris_sorted.info.dims[k] = 1; d_harris_idx.info.dims[k] = 1; d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] * d_harris_sorted.info.strides[k - 1]; d_harris_idx.info.strides[k] = d_harris_idx.info.dims[k - 1] * d_harris_idx.info.strides[k - 1]; } d_harris_sorted.info.offset = 0; d_harris_idx.info.offset = 0; 
d_harris_sorted.data = d_score_harris; d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); sort0_index<float, false>(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float)); usable_feat = min(usable_feat, lvl_best[i]); // Keep only features with higher Harris responses const dim_type keep_blk = divup(usable_feat, ORB_THREADS); const NDRange local_keep(ORB_THREADS, 1); const NDRange global_keep(keep_blk * ORB_THREADS, 1); auto kfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, const unsigned> (kfKernel[device]); kfOp(EnqueueArgs(getQueue(), global_keep, local_keep), *d_x_lvl, *d_y_lvl, *d_score_lvl, *d_x_harris, *d_y_harris, *d_harris_sorted.data, *d_harris_idx.data, usable_feat); CL_DEBUG_FINISH(getQueue()); bufferFree(d_x_harris); bufferFree(d_y_harris); bufferFree(d_harris_sorted.data); bufferFree(d_harris_idx.data); cl::Buffer* d_ori_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float)); // Compute orientation of features const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X); const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y); const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y); auto caOp = make_kernel<Buffer, Buffer, Buffer, const unsigned, Buffer, KParam, const unsigned> (caKernel[device]); caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_x_lvl, *d_y_lvl, *d_ori_lvl, usable_feat, *lvl_img.data, lvl_img.info, patch_size); CL_DEBUG_FINISH(getQueue()); Param lvl_filt; Param lvl_tmp; if (blur_img) { lvl_filt = lvl_img; lvl_tmp = lvl_img; lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * lvl_filt.info.dims[1] * sizeof(T)); lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * lvl_tmp.info.dims[1] * sizeof(T)); // Calculate a 
separable Gaussian kernel if (h_gauss == nullptr) { h_gauss = new T[gauss_len]; gaussian1D(h_gauss, gauss_len, 2.f); gauss_filter.info.dims[0] = gauss_len; gauss_filter.info.strides[0] = 1; for (int k = 1; k < 4; k++) { gauss_filter.info.dims[k] = 1; gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] * gauss_filter.info.strides[k - 1]; } dim_type gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3]; gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T)); getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss); } // Filter level image with Gaussian kernel to reduce noise sensitivity convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter); convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter); bufferFree(lvl_tmp.data); } // Compute ORB descriptors cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned)); unsigned* h_desc_lvl = new unsigned[usable_feat * 8]; for (int j = 0; j < (int)usable_feat * 8; j++) h_desc_lvl[j] = 0; getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0, usable_feat * 8 * sizeof(unsigned), h_desc_lvl); delete[] h_desc_lvl; auto eoOp = make_kernel<Buffer, const unsigned, Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const float, const unsigned> (eoKernel[device]); if (blur_img) { eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_desc_lvl, usable_feat, *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl, *lvl_filt.data, lvl_filt.info, lvl_scl, patch_size); CL_DEBUG_FINISH(getQueue()); bufferFree(lvl_filt.data); } else { eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_desc_lvl, usable_feat, *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl, *lvl_img.data, lvl_img.info, lvl_scl, patch_size); CL_DEBUG_FINISH(getQueue()); } // Store results to pyramids total_feat += usable_feat; feat_pyr[i] = usable_feat; d_x_pyr[i] = d_x_lvl; d_y_pyr[i] = d_y_lvl; d_score_pyr[i] = d_score_lvl; d_ori_pyr[i] = d_ori_lvl; 
d_size_pyr[i] = d_size_lvl; d_desc_pyr[i] = d_desc_lvl; if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); } if (gauss_filter.data != nullptr) bufferFree(gauss_filter.data); if (h_gauss != nullptr) delete[] h_gauss; // If no features are found, set found features to 0 and return if (total_feat == 0) { *out_feat = 0; return; } // Allocate output memory x_out.info.dims[0] = total_feat; x_out.info.strides[0] = 1; y_out.info.dims[0] = total_feat; y_out.info.strides[0] = 1; score_out.info.dims[0] = total_feat; score_out.info.strides[0] = 1; ori_out.info.dims[0] = total_feat; ori_out.info.strides[0] = 1; size_out.info.dims[0] = total_feat; size_out.info.strides[0] = 1; desc_out.info.dims[0] = 8; desc_out.info.strides[0] = 1; desc_out.info.dims[1] = total_feat; desc_out.info.strides[1] = desc_out.info.dims[0]; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = 1; x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1]; y_out.info.dims[k] = 1; y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1]; score_out.info.dims[k] = 1; score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1]; ori_out.info.dims[k] = 1; ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1]; size_out.info.dims[k] = 1; size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1]; if (k > 1) { desc_out.info.dims[k] = 1; desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1]; } } if (total_feat > 0) { size_t out_sz = total_feat * sizeof(float); x_out.data = bufferAlloc(out_sz); y_out.data = bufferAlloc(out_sz); score_out.data = bufferAlloc(out_sz); ori_out.data = bufferAlloc(out_sz); size_out.data = bufferAlloc(out_sz); size_t desc_sz = total_feat * 8 * sizeof(unsigned); desc_out.data = bufferAlloc(desc_sz); } unsigned offset = 0; for (unsigned i = 0; i < max_levels; i++) { if (feat_pyr[i] == 0) continue; if (i > 0) offset += feat_pyr[i-1]; 
getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned)); bufferFree(d_x_pyr[i]); bufferFree(d_y_pyr[i]); bufferFree(d_score_pyr[i]); bufferFree(d_ori_pyr[i]); bufferFree(d_size_pyr[i]); bufferFree(d_desc_pyr[i]); } // Sets number of output features *out_feat = total_feat; } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, const Param filter) { try { int f0 = filter.info.dims[0]; int f1 = filter.info.dims[1]; std::string ref_name = std::string("conv2_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::string(dtype_traits<aT>::getName()) + std::string("_") + std::to_string(expand) + std::string("_") + std::to_string(f0) + std::string("_") + std::to_string(f1); int device = getActiveDeviceId(); kc_t::iterator idx = kernelCaches[device].find(ref_name); kc_entry_t entry; if (idx == kernelCaches[device].end()) { size_t LOC_SIZE = (THREADS_X+2*(f0-1))*(THREADS_Y+2*(f1-1)); std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D accType="<< dtype_traits<aT>::getName() << " -D BASE_DIM="<< 2 /* hard constant specific to this convolution type */ << " -D FLEN0=" << f0 << " -D FLEN1=" << f1 << " -D EXPAND="<< expand << " -D C_SIZE="<< LOC_SIZE; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } Program prog; buildProgram(prog, convolve_cl, convolve_cl_len, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "convolve"); kernelCaches[device][ref_name] = entry; } else { entry = idx->second; } auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam, Buffer, KParam, int, int, int, int, int, int >(*entry.ker); convOp(EnqueueArgs(getQueue(), param.global, param.local), *out.data, out.info, *signal.data, signal.info, *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[1], param.o[2], param.s[1], param.s[2]); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }