/// Enqueues the five "scan" kernels (scan1 .. scan5) that turn Source into Dest.
///
/// Source and Dest must have the same size and Dest must be a float image
/// (both enforced through CheckSameSize / CheckFloat). scan2 and scan4 are
/// launched only when GetNbGroupsW / GetNbGroupsH report more than one group;
/// they write into m_VerticalJunctions and m_HorizontalJunctions, which scan3
/// and scan5 then take as additional inputs.
///
/// NOTE: this function redefines the KERNEL_RANGE macro part-way through and
/// does not restore it, so the new definition stays in effect for whatever
/// follows in the translation unit.
void Integral::IntegralScan(IImage& Source, IImage& Dest)
{
   PrepareFor(Source);

   CheckSameSize(Source, Dest);
   CheckFloat(Dest);

   // First pass: issued with whatever KERNEL_RANGE is currently defined as.
   uint Width = Source.Width();
   uint Height = Source.Height();
   Kernel(scan1, Source, Dest, Width, Height);

   // Second pass: one work-item column per group boundary along the width.
   const auto NbGroupsW = GetNbGroupsW(Source);
   if (NbGroupsW > 1)
   {
      const NDRange VerticalRange(NbGroupsW - 1, Source.Height());
      auto Scan2 = make_kernel<Image2D, Image2D>(SelectProgram(Dest), "scan2");
      Scan2(EnqueueArgs(*m_CL, VerticalRange), Dest, *m_VerticalJunctions);
   }

   // From here on the Kernel() macro launches over the full range of the image.
#undef KERNEL_RANGE
#define KERNEL_RANGE(src_img) src_img.FullRange()

   Kernel(scan3, In(Dest, *m_VerticalJunctions), Dest);

   // Fourth pass: one work-item row per group boundary along the height.
   const auto NbGroupsH = GetNbGroupsH(Source);
   if (NbGroupsH > 1)
   {
      const NDRange HorizontalRange(Source.Width(), NbGroupsH - 1);
      auto Scan4 = make_kernel<Image2D, Image2D>(SelectProgram(Dest), "scan4");
      Scan4(EnqueueArgs(*m_CL, HorizontalRange), Dest, *m_HorizontalJunctions);
   }

   Kernel(scan5, In(Dest, *m_HorizontalJunctions), Dest);
}
/**
 * Fills the launch-related fields of @p param for a convolution whose
 * base dimensionality is @p baseDim.
 *
 * @param param    Receives local / global ranges, nBBS0 / nBBS1 and, for
 *                 baseDim 1 and 3, loc_size. param.launchMoreBlocks is read:
 *                 when it is set, every batch dimension is treated as 1.
 * @param oDims    Output dimensions (4 entries are indexed; only read).
 * @param fDims    Filter dimensions (up to 3 entries are read).
 * @param baseDim  1, 2 or 3. Any other value leaves @p param untouched.
 *
 * Note that the baseDim == 2 branch does not assign param.loc_size.
 */
void prepareKernelArgs(conv_kparam_t& param, dim_t *oDims, const dim_t *fDims, int baseDim)
{
    // Dimensions at or above baseDim are batch dimensions; the rest stay at 1.
    int batchDims[4] = {1, 1, 1, 1};
    for (int d = baseDim; d < 4; ++d) {
        if (!param.launchMoreBlocks) {
            batchDims[d] = oDims[d];
        }
    }

    switch (baseDim) {
        case 1: {
            param.local    = NDRange(THREADS, 1);
            param.nBBS0    = divup(oDims[0], THREADS);
            param.nBBS1    = batchDims[2];
            param.global   = NDRange(param.nBBS0 * THREADS * batchDims[1],
                                     param.nBBS1 * batchDims[3]);
            // One tile of THREADS elements plus a (filter length - 1) halo on each side.
            param.loc_size = (THREADS+2*(fDims[0]-1)) * sizeof(T);
            break;
        }
        case 2: {
            param.local  = NDRange(THREADS_X, THREADS_Y);
            param.nBBS0  = divup(oDims[0], THREADS_X);
            param.nBBS1  = divup(oDims[1], THREADS_Y);
            param.global = NDRange(param.nBBS0*THREADS_X*batchDims[2],
                                   param.nBBS1*THREADS_Y*batchDims[3]);
            break;
        }
        case 3: {
            param.local    = NDRange(CUBE_X, CUBE_Y, CUBE_Z);
            param.nBBS0    = divup(oDims[0], CUBE_X);
            param.nBBS1    = divup(oDims[1], CUBE_Y);
            int blk_z      = divup(oDims[2], CUBE_Z);
            param.global   = NDRange(param.nBBS0 * CUBE_X * batchDims[3],
                                     param.nBBS1 * CUBE_Y,
                                     blk_z * CUBE_Z);
            // CUBE_X x CUBE_Y x CUBE_Z tile, each axis widened by its filter halo.
            param.loc_size = (CUBE_X+2*(fDims[0]-1)) *
                             (CUBE_Y+2*(fDims[1]-1)) *
                             (CUBE_Z+2*(fDims[2]-1)) * sizeof(T);
            break;
        }
        default:
            // Unsupported base dimension: nothing is written to param.
            break;
    }
}
/**
 * Enqueues the "diff_kernel" OpenCL kernel that writes into @p out from @p in.
 *
 * The program is built once per device (guarded by std::call_once) with
 * -D T, -D DIM and -D isDiff2 taken from the enclosing scope, plus
 * -D USE_DOUBLE when T is double or cdouble. The resulting Program / Kernel
 * objects are heap-allocated, stored in function-local static maps keyed by
 * device id, and are not released by this function.
 *
 * @param out     Destination buffer and its dimension info.
 * @param in      Source buffer and its dimension info.
 * @param indims  Number of dimensions of the input; when it is 1 and dim is 0
 *                a flat (TX*TY, 1, 1) work-group is used instead of (TX, TY, 1).
 *
 * OpenCL errors are passed to CL_TO_AF_ERROR; the handler re-throws the
 * original exception if that macro returns.
 */
void diff(Param out, const Param in, const unsigned indims)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> diffProgs;
        static std::map<int, Kernel*>  diffKernels;

        int device = getActiveDeviceId();

        // Compile the program and create the kernel the first time this
        // device is used.
        std::call_once( compileFlags[device], [device] () {
            std::ostringstream options;
            options << " -D T="       << dtype_traits<T>::getName()
                    << " -D DIM="     << dim
                    << " -D isDiff2=" << isDiff2;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, diff_cl, diff_cl_len, options.str());
            diffProgs[device]   = new Program(prog);
            diffKernels[device] = new Kernel(*diffProgs[device], "diff_kernel");
        });

        auto diffOp = make_kernel<Buffer, const Buffer,
                                  const KParam, const KParam,
                                  const dim_type, const dim_type, const dim_type>
                                  (*diffKernels[device]);

        NDRange local(TX, TY, 1);
        if(dim == 0 && indims == 1) {
            local = NDRange(TX * TY, 1, 1);
        }

        // Work-groups needed to cover one 2D slice; dims[2] and dims[3] are
        // folded into the global x and y extents respectively.
        dim_type blocksPerMatX = divup(out.info.dims[0], local[0]);
        dim_type blocksPerMatY = divup(out.info.dims[1], local[1]);
        NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                       local[1] * blocksPerMatY * out.info.dims[3],
                       1);

        // Total number of output elements, passed to the kernel.
        const dim_type oElem = out.info.dims[0] * out.info.dims[1] *
                               out.info.dims[2] * out.info.dims[3];

        diffOp(EnqueueArgs(getQueue(), global, local),
               *out.data, *in.data, out.info, in.info,
               oElem, blocksPerMatX, blocksPerMatY);

        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error& err) {
        // Caught by const reference: no copy of the exception object and no
        // slicing if a derived error type is ever thrown.
        CL_TO_AF_ERROR(err);
        throw;
    }
}
/**
 * Enqueues the "diff_kernel" OpenCL kernel, obtaining it from the per-device
 * kernel cache and building it on a cache miss.
 *
 * The cache key is "diff_kernel_" followed by the type name, dim and isDiff2.
 * On a miss the program is compiled with -D T, -D DIM, -D isDiff2 (and
 * -D USE_DOUBLE for double / cdouble) and the new entry is handed to
 * addKernelToCache.
 *
 * @param out     Destination buffer and its dimension info.
 * @param in      Source buffer and its dimension info.
 * @param indims  Number of dimensions of the input; when it is 1 and dim is 0
 *                a flat (TX*TY, 1, 1) work-group is used instead of (TX, TY, 1).
 */
void diff(Param out, const Param in, const unsigned indims)
{
    // Build the cache key piece by piece.
    std::string refName = std::string("diff_kernel_");
    refName += std::string(dtype_traits<T>::getName());
    refName += std::to_string(dim);
    refName += std::to_string(isDiff2);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    const bool cacheMiss = (!entry.prog && !entry.ker);
    if (cacheMiss) {
        std::ostringstream defines;
        defines << " -D T=" << dtype_traits<T>::getName();
        defines << " -D DIM=" << dim;
        defines << " -D isDiff2=" << isDiff2;
        if (std::is_same<T, double>::value ||
            std::is_same<T, cdouble>::value) {
            defines << " -D USE_DOUBLE";
        }

        const char* sources[]    = {diff_cl};
        const int   sourceLens[] = {diff_cl_len};

        Program built;
        buildProgram(built, 1, sources, sourceLens, defines.str());

        entry.prog = new Program(built);
        entry.ker  = new Kernel(*entry.prog, "diff_kernel");
        addKernelToCache(device, refName, entry);
    }

    KernelFunctor<Buffer, const Buffer,
                  const KParam, const KParam,
                  const int, const int, const int> diffOp(*entry.ker);

    // A 1D input differenced along dimension 0 gets a flat work-group.
    NDRange local = (dim == 0 && indims == 1) ? NDRange(TX * TY, 1, 1)
                                              : NDRange(TX, TY, 1);

    // Work-groups needed to cover one 2D slice; dims[2] and dims[3] are folded
    // into the global x and y extents respectively.
    int blocksPerMatX = divup(out.info.dims[0], local[0]);
    int blocksPerMatY = divup(out.info.dims[1], local[1]);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3],
                   1);

    // Total number of output elements, passed to the kernel.
    const int oElem = out.info.dims[0] * out.info.dims[1] *
                      out.info.dims[2] * out.info.dims[3];

    diffOp(EnqueueArgs(getQueue(), global, local),
           *out.data, *in.data, out.info, in.info,
           oElem, blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
/**
 * Enqueues the "set" OpenCL kernel over @p elements work-items, passing it
 * @p ptr, @p val and the element count.
 *
 * The program is compiled once per device (guarded by std::call_once) with
 * "-D T=<type name>" and kept, together with its kernel, in function-local
 * static arrays indexed by device id.
 *
 * @param ptr       Buffer handed to the kernel as its first argument.
 * @param val       Value handed to the kernel as its second argument.
 * @param elements  Global work size, also passed as the third kernel argument.
 */
void set(Buffer &ptr, T val, const size_t &elements)
{
    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static Program        setProgs[DeviceManager::MAX_DEVICES];
    static Kernel         setKernels[DeviceManager::MAX_DEVICES];

    const int device = getActiveDeviceId();

    // First use on this device: build the program and create the kernel.
    std::call_once(compileFlags[device], [device] () {
        const string buildOpts = string("-D T=") + dtype_traits<T>::getName();

        Program::Sources sources;
        sources.emplace_back(set_cl, set_cl_len);

        Program &program = setProgs[device];
        program = Program(getContext(), sources);
        program.build(buildOpts.c_str());

        setKernels[device] = Kernel(program, "set");
    });

    // NOTE(review): the count is forwarded as host `unsigned long`, whose width
    // is platform-dependent — confirm it matches the kernel's parameter type.
    auto launch = make_kernel<Buffer, T, const unsigned long>(setKernels[device]);
    const EnqueueArgs launchArgs(getQueue(), NDRange(elements));
    launch(launchArgs, ptr, val, elements);
}
void packDataHelper(Param packed, Param sig, Param filter, const int baseDim, ConvolveBatchKind kind) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fftconvolveProgs; static std::map<int, Kernel*> pdKernel; static std::map<int, Kernel*> paKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if ((af_dtype) dtype_traits<convT>::af_type == c32) { options << " -D CONVT=float"; } else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) { options << " -D CONVT=double" << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fftconvolve_pack_cl, fftconvolve_pack_cl_len, options.str()); fftconvolveProgs[device] = new Program(prog); pdKernel[device] = new Kernel(*fftconvolveProgs[device], "pack_data"); paKernel[device] = new Kernel(*fftconvolveProgs[device], "pad_array"); }); Param sig_tmp, filter_tmp; calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind); int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3]; int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3]; // Number of packed complex elements in dimension 0 int sig_half_d0 = divup(sig.info.dims[0], 2); int sig_half_d0_odd = sig.info.dims[0] % 2; int blocks = divup(sig_packed_elem, THREADS); // Locate features kernel sizes NDRange local(THREADS); NDRange global(blocks * THREADS); // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s auto pdOp = make_kernel<Buffer, KParam, Buffer, KParam, const int, const int> (*pdKernel[device]); pdOp(EnqueueArgs(getQueue(), global, local), *sig_tmp.data, sig_tmp.info, *sig.data, sig.info, sig_half_d0, sig_half_d0_odd); CL_DEBUG_FINISH(getQueue()); blocks = divup(filter_packed_elem, THREADS); global = NDRange(blocks * THREADS); // Pad filter array 
with 0s auto paOp = make_kernel<Buffer, KParam, Buffer, KParam> (*paKernel[device]); paOp(EnqueueArgs(getQueue(), global, local), *filter_tmp.data, filter_tmp.info, *filter.data, filter.info); CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }