Пример #1
0
// Runs the multi-stage scan ("scan1" .. "scan5" kernels) of Source into Dest.
//
// Source - input image; its width, height and work-group counts drive the
//          launch ranges used below.
// Dest   - output image; validated against Source by CheckSameSize and by
//          CheckFloat before any kernel is launched (what those checks do on
//          failure is defined outside this block).
//
// NOTE(review): `Kernel(...)` looks like a macro whose launch range comes from
// KERNEL_RANGE, and m_VerticalJunctions / m_HorizontalJunctions are presumably
// set up by PrepareFor(Source) — confirm against the class definition.
void Integral::IntegralScan(IImage& Source, IImage& Dest)
{
   PrepareFor(Source);

   CheckSameSize(Source, Dest);
   CheckFloat(Dest);

   uint Width = Source.Width();
   uint Height = Source.Height();

   // Stage 1: scan1 is given Source, Dest and the image dimensions.
   Kernel(scan1, Source, Dest, Width, Height);

   // Stage 2 only runs when Source spans more than one work-group along its
   // width. scan2 is enqueued over (groups - 1) x Height work-items with Dest
   // and the vertical-junction image as its two arguments.
   if (GetNbGroupsW(Source) > 1)
   {
      make_kernel<Image2D, Image2D>(SelectProgram(Dest), "scan2")
         (EnqueueArgs(*m_CL, NDRange(GetNbGroupsW(Source) - 1, Source.Height())), Dest, *m_VerticalJunctions);
   }

   // From this point on KERNEL_RANGE expands to the image's FullRange().
   // Being a preprocessor redefinition, it stays in effect for everything that
   // follows in the translation unit, not just the rest of this function.
#undef KERNEL_RANGE
#define KERNEL_RANGE(src_img) src_img.FullRange()

   // Stage 3: scan3 receives Dest and the vertical junctions wrapped in
   // In(...), followed by Dest again.
   Kernel(scan3, In(Dest, *m_VerticalJunctions), Dest);

   // Stage 4 mirrors stage 2 for the other axis: it only runs when there is
   // more than one work-group along the height, over Width x (groups - 1)
   // work-items, with Dest and the horizontal-junction image as arguments.
   if (GetNbGroupsH(Source) > 1)
   {
      make_kernel<Image2D, Image2D>(SelectProgram(Dest), "scan4")
         (EnqueueArgs(*m_CL, NDRange(Source.Width(), GetNbGroupsH(Source) - 1)), Dest, *m_HorizontalJunctions);
   }

   // Stage 5: scan5 receives Dest and the horizontal junctions wrapped in
   // In(...), followed by Dest again.
   Kernel(scan5, In(Dest, *m_HorizontalJunctions), Dest);
}
// Fills the launch configuration in `param` for a 1D, 2D or 3D convolution.
//
// param   - receives the local and global NDRange and the block counts
//           nBBS0 / nBBS1; for baseDim 1 and 3 it also receives loc_size,
//           computed in bytes via sizeof(T). param.launchMoreBlocks is read.
// oDims   - output dimensions; up to four entries are read.
// fDims   - filter dimensions; read only for baseDim 1 (entry 0) and
//           baseDim 3 (entries 0..2).
// baseDim - dimensionality of the convolution. Any value other than 1, 2 or 3
//           leaves `param` unmodified.
void prepareKernelArgs(conv_kparam_t& param, dim_t *oDims,
                       const dim_t *fDims, int baseDim)
{
    // Every dimension from baseDim upwards is a batch dimension. Its extent is
    // folded into the global range unless launchMoreBlocks is set, in which
    // case the multiplier stays at 1.
    int batch[4] = {1, 1, 1, 1};
    for (int d = baseDim; d < 4; ++d) {
        if (!param.launchMoreBlocks) {
            batch[d] = oDims[d];
        }
    }

    switch (baseDim) {
    case 1:
        param.nBBS0    = divup(oDims[0], THREADS);
        param.nBBS1    = batch[2];
        param.local    = NDRange(THREADS, 1);
        param.global   = NDRange(param.nBBS0 * THREADS * batch[1],
                                 param.nBBS1 * batch[3]);
        // One work-group tile widened by (filter length - 1) on both sides.
        param.loc_size = (THREADS+2*(fDims[0]-1)) * sizeof(T);
        break;

    case 2:
        param.nBBS0    = divup(oDims[0], THREADS_X);
        param.nBBS1    = divup(oDims[1], THREADS_Y);
        param.local    = NDRange(THREADS_X, THREADS_Y);
        param.global   = NDRange(param.nBBS0*THREADS_X*batch[2],
                                 param.nBBS1*THREADS_Y*batch[3]);
        // loc_size is intentionally not written for the 2D case.
        break;

    case 3: {
        param.nBBS0    = divup(oDims[0], CUBE_X);
        param.nBBS1    = divup(oDims[1], CUBE_Y);
        int zBlocks    = divup(oDims[2], CUBE_Z);
        param.local    = NDRange(CUBE_X, CUBE_Y, CUBE_Z);
        param.global   = NDRange(param.nBBS0 * CUBE_X * batch[3],
                                 param.nBBS1 * CUBE_Y,
                                 zBlocks * CUBE_Z);
        // Work-group cube widened by (filter extent - 1) on both sides of
        // each of the three axes.
        param.loc_size = (CUBE_X+2*(fDims[0]-1)) * (CUBE_Y+2*(fDims[1]-1)) *
                         (CUBE_Z+2*(fDims[2]-1)) * sizeof(T);
        break;
    }

    default:
        break;
    }
}
Пример #3
0
        /// Enqueues the OpenCL "diff_kernel" on the active device's queue.
        ///
        /// The program is compiled at most once per device (guarded by a
        /// std::once_flag per device id) with T, DIM and isDiff2 passed as
        /// -D build options; USE_DOUBLE is added when T is double or cdouble.
        /// T, dim and isDiff2 are declared outside this block (presumably
        /// template parameters — confirm at the declaration).
        ///
        /// @param out    Output buffer and its KParam info; out.info.dims
        ///               determines the launch geometry.
        /// @param in     Input buffer and its KParam info.
        /// @param indims Number of input dimensions; when it is 1 and dim is 0
        ///               a flat (TX * TY, 1, 1) work-group is used.
        ///
        /// A cl::Error raised while building or enqueueing is handed to
        /// CL_TO_AF_ERROR; if that macro returns, the original exception is
        /// rethrown.
        void diff(Param out, const Param in, const unsigned indims)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>   diffProgs;
                static std::map<int, Kernel*>  diffKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D T="        << dtype_traits<T>::getName()
                            << " -D DIM="      << dim
                            << " -D isDiff2=" << isDiff2;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, diff_cl, diff_cl_len, options.str());
                    // The program and kernel are heap-allocated and never
                    // freed; they live for the remainder of the process.
                    diffProgs[device]   = new Program(prog);
                    diffKernels[device] = new Kernel(*diffProgs[device], "diff_kernel");
                });

                auto diffOp = make_kernel<Buffer, const Buffer, const KParam, const KParam,
                                          const dim_type, const dim_type, const dim_type>
                                          (*diffKernels[device]);

                NDRange local(TX, TY, 1);
                if(dim == 0 && indims == 1) {
                    local = NDRange(TX * TY, 1, 1);
                }

                // Work-groups needed to cover one dims[0] x dims[1] slice.
                dim_type blocksPerMatX = divup(out.info.dims[0], local[0]);
                dim_type blocksPerMatY = divup(out.info.dims[1], local[1]);
                // dims[2] and dims[3] are folded into the x and y extents of
                // the global range respectively.
                NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                               local[1] * blocksPerMatY * out.info.dims[3],
                               1);

                const dim_type oElem = out.info.dims[0] * out.info.dims[1]
                                     * out.info.dims[2] * out.info.dims[3];

                diffOp(EnqueueArgs(getQueue(), global, local),
                       *out.data, *in.data, out.info, in.info,
                       oElem, blocksPerMatX, blocksPerMatY);

                CL_DEBUG_FINISH(getQueue());
            } catch (const cl::Error& err) {
                // Caught by const reference so the exception object is not
                // copied on the way into the handler.
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Пример #4
0
// Enqueues the OpenCL "diff_kernel" on the active device's queue, building
// the program on first use and storing it in the kernel cache.
//
// out    - output buffer and its KParam info; out.info.dims determines the
//          launch geometry.
// in     - input buffer and its KParam info.
// indims - number of input dimensions; when it is 1 and dim is 0 a flat
//          (TX * TY, 1, 1) work-group is used.
//
// T, dim and isDiff2 are declared outside this block (presumably template
// parameters — confirm at the declaration).
void diff(Param out, const Param in, const unsigned indims)
{
    // Cache key: kernel name followed by the element type name, dim and
    // isDiff2.
    std::string cacheKey("diff_kernel_");
    cacheKey += std::string(dtype_traits<T>::getName());
    cacheKey += std::to_string(dim);
    cacheKey += std::to_string(isDiff2);

    const int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cacheKey);

    // An entry with neither a program nor a kernel means nothing has been
    // cached for this key on this device yet.
    const bool needsBuild = (entry.prog == nullptr && entry.ker == nullptr);
    if (needsBuild) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D DIM=" << dim;
        buildOpts << " -D isDiff2=" << isDiff2;

        const bool usesDouble = std::is_same<T, double>::value ||
                                std::is_same<T, cdouble>::value;
        if (usesDouble) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char* sources[] = {diff_cl};
        const int   lengths[] = {diff_cl_len};
        Program built;
        buildProgram(built, 1, sources, lengths, buildOpts.str());

        entry.prog = new Program(built);
        entry.ker  = new Kernel(*entry.prog, "diff_kernel");
        addKernelToCache(device, cacheKey, entry);
    }

    // Work-group shape: flat for the one-dimensional case along dim 0,
    // TX x TY otherwise.
    NDRange local = (dim == 0 && indims == 1) ? NDRange(TX * TY, 1, 1)
                                              : NDRange(TX, TY, 1);

    // Work-groups needed to cover one dims[0] x dims[1] slice; dims[2] and
    // dims[3] are folded into the x and y extents of the global range.
    const int blocksPerMatX = divup(out.info.dims[0], local[0]);
    const int blocksPerMatY = divup(out.info.dims[1], local[1]);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3],
                   1);

    const int totalOutElems =
        out.info.dims[0] * out.info.dims[1] * out.info.dims[2] * out.info.dims[3];

    auto launch = KernelFunctor< Buffer, const Buffer, const KParam, const KParam,
                                 const int, const int, const int> (*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, *in.data, out.info, in.info,
           totalOutElems, blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
Пример #5
0
// Enqueues the OpenCL "set" kernel over `elements` work-items on the active
// device's queue, passing (ptr, val, elements) as kernel arguments.
//
// ptr      - device buffer handed to the kernel.
// val      - value handed to the kernel.
// elements - number of work-items launched, also passed as the last argument.
//
// The program is compiled at most once per device, with the element type
// name supplied through "-D T=". T is declared outside this block
// (presumably a template parameter — confirm at the declaration).
void
set(Buffer &ptr, T val, const size_t &elements)
{
    static std::once_flag builtOnce[DeviceManager::MAX_DEVICES];
    static Program         programs[DeviceManager::MAX_DEVICES];
    static Kernel           kernels[DeviceManager::MAX_DEVICES];

    const int dev = getActiveDeviceId();

    // First call on this device: compile the source and create the kernel.
    std::call_once(builtOnce[dev], [dev] () {
        Program::Sources sources;
        sources.emplace_back(set_cl, set_cl_len);

        Program &prog = programs[dev];
        prog = Program(getContext(), sources);

        string buildOpt("-D T=");
        buildOpt += dtype_traits<T>::getName();
        prog.build(buildOpt.c_str());

        kernels[dev] = Kernel(prog, "set");
    });

    auto launch = make_kernel<Buffer, T, const unsigned long>(kernels[dev]);
    EnqueueArgs launchArgs(getQueue(), NDRange(elements));
    launch(launchArgs, ptr, val, elements);
}
Пример #6
0
/// Enqueues the "pack_data" kernel for the signal and the "pad_array" kernel
/// for the filter, writing both results into regions of `packed` as laid out
/// by calcParamSizes.
///
/// The program is compiled at most once per device (guarded by a
/// std::once_flag per device id). T, convT and isDouble are declared outside
/// this block (presumably template parameters — confirm at the declaration).
///
/// @param packed  Destination parameter handed to calcParamSizes.
/// @param sig     Input signal.
/// @param filter  Input filter.
/// @param baseDim Forwarded to calcParamSizes.
/// @param kind    Forwarded to calcParamSizes.
///
/// A cl::Error raised while building or enqueueing is handed to
/// CL_TO_AF_ERROR; if that macro returns, the original exception is rethrown.
void packDataHelper(Param packed,
                    Param sig,
                    Param filter,
                    const int baseDim,
                    ConvolveBatchKind kind)
{
    try {
        static std::once_flag     compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  fftconvolveProgs;
        static std::map<int, Kernel*>   pdKernel;
        static std::map<int, Kernel*>   paKernel;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();

                // NOTE(review): when neither branch matches (for example c64
                // with isDouble false) CONVT is left undefined — confirm the
                // kernel source tolerates that.
                if ((af_dtype) dtype_traits<convT>::af_type == c32) {
                    options << " -D CONVT=float";
                }
                else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) {
                    options << " -D CONVT=double"
                            << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, fftconvolve_pack_cl, fftconvolve_pack_cl_len, options.str());
                // The program and kernels are heap-allocated and never freed;
                // they live for the remainder of the process.
                fftconvolveProgs[device] = new Program(prog);

                pdKernel[device] = new Kernel(*fftconvolveProgs[device], "pack_data");
                paKernel[device] = new Kernel(*fftconvolveProgs[device], "pad_array");
            });

        Param sig_tmp, filter_tmp;
        calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind);

        // Total element counts of the packed signal and filter regions.
        int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3];
        int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3];

        // Number of packed complex elements in dimension 0
        int sig_half_d0 = divup(sig.info.dims[0], 2);
        int sig_half_d0_odd = sig.info.dims[0] % 2;

        int blocks = divup(sig_packed_elem, THREADS);

        // 1D launch geometry: enough THREADS-sized work-groups to cover every
        // packed signal element.
        NDRange local(THREADS);
        NDRange global(blocks * THREADS);

        // Pack signal in a complex matrix where first dimension is half the input
        // (allows faster FFT computation) and pad array to a power of 2 with 0s
        auto pdOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                const int, const int> (*pdKernel[device]);

        pdOp(EnqueueArgs(getQueue(), global, local),
             *sig_tmp.data, sig_tmp.info, *sig.data, sig.info,
             sig_half_d0, sig_half_d0_odd);
        CL_DEBUG_FINISH(getQueue());

        // Resize the launch to cover every packed filter element.
        blocks = divup(filter_packed_elem, THREADS);
        global = NDRange(blocks * THREADS);

        // Pad filter array with 0s
        auto paOp = make_kernel<Buffer, KParam,
                                Buffer, KParam> (*paKernel[device]);

        paOp(EnqueueArgs(getQueue(), global, local),
             *filter_tmp.data, filter_tmp.info,
             *filter.data, filter.info);
        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error& err) {
        // Caught by const reference so the exception object is not copied on
        // the way into the handler.
        CL_TO_AF_ERROR(err);
        throw;
    }
}