// Enqueues "get_out_idx_kernel" (compiled from where_cl for element type T)
// on the active device's queue.
//
// The program/kernel pair is fetched from the per-device kernel cache under a
// name that embeds T's type name; on a cache miss it is compiled and
// registered before the launch.
//
//   out_data            buffer bound to the kernel's first argument
//   otmp, rtmp, in      buffer + KParam pairs forwarded to the kernel, in
//                       that order
//   threads_x           work-group size along dimension 0; dimension 1 gets
//                       THREADS_PER_GROUP / threads_x
//   groups_x, groups_y  group counts used to size the launch and forwarded
//                       to the kernel
static void get_out_idx(Buffer *out_data,
                        Param &otmp, Param &rtmp,
                        Param &in, uint threads_x,
                        uint groups_x, uint groups_y)
{
    int device = getActiveDeviceId();

    // Cache key: one compiled kernel per element type.
    std::string refName("get_out_idx_kernel_");
    refName += dtype_traits<T>::getName();

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        ToNumStr<T> toNumStr;

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D zero=" << toNumStr(scalar<T>(0));
        options << " -D CPLX=" << af::iscplx<T>();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {where_cl};
        const int   lengths[] = {where_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "get_out_idx_kernel");
        addKernelToCache(device, refName, entry);
    }

    // Dimensions 2 and 3 of `in` are folded into the x and y extents of the
    // global range respectively.
    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(local[0] * groups_x * in.info.dims[2],
                   local[1] * groups_y * in.info.dims[3]);

    // Rounded-up quotient of otmp's first extent over the x-direction thread
    // count (threads_x * groups_x); forwarded to the kernel as its last
    // argument.
    uint lim = divup(otmp.info.dims[0], (threads_x * groups_x));

    auto launch = KernelFunctor< Buffer, Buffer, KParam, Buffer, KParam,
                                 Buffer, KParam, uint, uint, uint>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out_data,
           *otmp.data, otmp.info,
           *rtmp.data, rtmp.info,
           *in.data, in.info,
           groups_x, groups_y, lim);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues the laset kernel named by laset_name<uplo>() on the active
// device's queue, forwarding m, n, offdiag, diag, dA (wrapped in a
// cl::Buffer), dA_offset and ldda as kernel arguments.
//
// The kernel is compiled from laset_cl on first use for each (uplo, T)
// combination and kept in the per-device kernel cache.
void laset(int m, int  n,
           T offdiag, T diag,
           cl_mem dA, size_t dA_offset, magma_int_t ldda)
{
    int device = getActiveDeviceId();

    // Cache key: <kernel name>_<type name><uplo>
    std::string refName(laset_name<uplo>());
    refName += "_";
    refName += dtype_traits<T>::getName();
    refName += std::to_string(uplo);

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D BLK_X=" << BLK_X;
        options << " -D BLK_Y=" << BLK_Y;
        options << " -D IS_CPLX=" << af::iscplx<T>();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {laset_cl};
        const int   lengths[] = {laset_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, laset_name<uplo>());
        addKernelToCache(device, refName, entry);
    }

    // Number of BLK_X-row and BLK_Y-column groups (rounded up for m, n >= 1).
    const int groups_x = (m - 1) / BLK_X + 1;
    const int groups_y = (n - 1) / BLK_Y + 1;

    NDRange local(BLK_X, 1);
    NDRange global(groups_x * local[0], groups_y * local[1]);

    // retain the cl_mem object during cl::Buffer creation
    cl::Buffer dAObj(dA, true);

    auto launch = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           m, n, offdiag, diag, dAObj, dA_offset, ldda);
}
Exemple #3
0
        // Enqueues "iota_kernel" (compiled from iota_cl) on the active
        // device's queue, passing out's buffer and KParam plus the number of
        // TILEX x TILEY blocks per matrix along each of the first two
        // dimensions.
        //
        // The program is compiled once per device (guarded by std::call_once)
        // with T, rep and, for double/cdouble, USE_DOUBLE defined. OpenCL
        // errors are converted with CL_TO_AF_ERROR.
        void iota(Param out)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>  iotaProgs;
                static std::map<int, Kernel*> iotaKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D T=" << dtype_traits<T>::getName();
                    options << " -D rep=" << rep;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, iota_cl, iota_cl_len, options.str());
                    iotaProgs[device]   = new Program(prog);
                    iotaKernels[device] = new Kernel(*iotaProgs[device], "iota_kernel");
                });

                auto iotaOp = make_kernel<Buffer, const KParam, const dim_type, const dim_type>
                                         (*iotaKernels[device]);

                NDRange local(TX, TY, 1);

                // Dimensions 2 and 3 of the output are folded into the x and
                // y extents of the global range.
                dim_type blocksPerMatX = divup(out.info.dims[0], TILEX);
                dim_type blocksPerMatY = divup(out.info.dims[1], TILEY);
                NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                               local[1] * blocksPerMatY * out.info.dims[3],
                               1);

                iotaOp(EnqueueArgs(getQueue(), global, local),
                       *out.data, out.info, blocksPerMatX, blocksPerMatY);

                CL_DEBUG_FINISH(getQueue());
            } catch (const cl::Error &err) {
                // Caught by const reference: avoids copying the exception
                // object and preserves its dynamic type.
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
// Enqueues "triangle_kernel" (compiled from triangle_cl) on the active
// device's queue, passing the buffers and KParams of `out` and `in` together
// with the number of TILEX x TILEY groups along the first two dimensions.
//
// One kernel is compiled and cached per (T, is_upper, is_unit_diag)
// combination; those values are baked in as compile-time defines.
void triangle(Param out, const Param in)
{
    int device = getActiveDeviceId();

    // Cache key: triangle_kernel_<type name><is_upper><is_unit_diag>
    std::string refName("triangle_kernel_");
    refName += dtype_traits<T>::getName();
    refName += std::to_string(is_upper);
    refName += std::to_string(is_unit_diag);

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D is_upper=" << is_upper;
        options << " -D is_unit_diag=" << is_unit_diag;
        options << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        options << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {triangle_cl};
        const int   lengths[] = {triangle_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "triangle_kernel");
        addKernelToCache(device, refName, entry);
    }

    int groups_x = divup(out.info.dims[0], TILEX);
    int groups_y = divup(out.info.dims[1], TILEY);

    // Dimensions 2 and 3 of the output are folded into the x and y extents
    // of the global range.
    NDRange local(TX, TY);
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto launch = KernelFunctor< Buffer, KParam, const Buffer, KParam,
                                 const int, const int >(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           *in.data, in.info,
           groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
// Enqueues "iota_kernel" (compiled from iota_cl for element type T) on the
// active device's queue, passing out's buffer and KParam, the four extents
// of sdims, the four extents of tdims, and the number of TILEX x TILEY
// blocks per matrix along the first two dimensions.
//
// The compiled kernel is kept in the per-device kernel cache, keyed by T's
// type name.
void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims)
{
    int device = getActiveDeviceId();

    std::string refName("iota_kernel_");
    refName += dtype_traits<T>::getName();

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {iota_cl};
        const int   lengths[] = {iota_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "iota_kernel");
        addKernelToCache(device, refName, entry);
    }

    int blocksPerMatX = divup(out.info.dims[0], TILEX);
    int blocksPerMatY = divup(out.info.dims[1], TILEY);

    // Dimensions 2 and 3 of the output are folded into the x and y extents
    // of the global range.
    NDRange local(IOTA_TX, IOTA_TY, 1);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3],
                   1);

    auto launch = KernelFunctor<Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int, const int, const int,
                                const int, const int> (*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           sdims[0], sdims[1], sdims[2], sdims[3],
           tdims[0], tdims[1], tdims[2], tdims[3],
           blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
Exemple #6
0
// Enqueues the "convolve" kernel (compiled from convolve_cl) using the launch
// geometry, local-memory size, impulse buffer and offsets carried in `param`.
//
// The program is compiled once per device (guarded by std::call_once) with
// T, accType, BASE_DIM and EXPAND defined, plus USE_DOUBLE for
// double/cdouble. OpenCL errors are converted with CL_TO_AF_ERROR.
//
//   param   launch configuration: global/local ranges, loc_size, impulse
//           buffer, nBBS0/nBBS1 and the o[]/s[] triples passed to the kernel
//   out     output buffer and KParam
//   signal  signal buffer and KParam
//   filter  only filter.info is forwarded; the filter data itself is taken
//           from param.impulse
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter)
{
    try {
        static std::once_flag  compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D accType="<< dtype_traits<aT>::getName()
                            << " -D BASE_DIM="<< bDim
                            << " -D EXPAND=" << expand;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, convolve_cl, convolve_cl_len, options.str());
                    convProgs[device]   = new Program(prog);
                    convKernels[device] = new Kernel(*convProgs[device], "convolve");
                });

        auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                        cl::LocalSpaceArg, Buffer, KParam,
                                        int, int,
                                        int, int, int,
                                        int, int, int
                                       >(*convKernels[device]);

        convOp(EnqueueArgs(getQueue(), param.global, param.local),
                *out.data, out.info, *signal.data, signal.info, cl::Local(param.loc_size),
                *param.impulse, filter.info, param.nBBS0, param.nBBS1,
                param.o[0], param.o[1], param.o[2], param.s[0], param.s[1], param.s[2]);

    } catch (const cl::Error &err) {
        // Caught by const reference: avoids copying the exception object and
        // preserves its dynamic type.
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Exemple #7
0
// Enqueues the "susan_responses" kernel (compiled from susan_cl) on the
// active device's queue.
//
// One kernel is compiled and cached per (T, radius) combination, with the
// block size, radius and a local-memory element count baked in as defines.
//
//   out           output buffer (first kernel argument)
//   in, in_off    input buffer and an offset forwarded to the kernel
//   idim0, idim1  input extents; the launch covers (idimN - 2 * edge)
//                 rounded up to a whole number of work-groups
//   t, g, edge    forwarded to the kernel unchanged
//
// NOTE(review): idim0/idim1 and edge are unsigned, so idimN < 2 * edge would
// wrap around when sizing the launch - presumably callers guarantee
// otherwise; confirm.
void susan(cl::Buffer* out, const cl::Buffer* in, const unsigned in_off,
           const unsigned idim0, const unsigned idim1, const float t,
           const float g, const unsigned edge) {
    int device = getActiveDeviceId();

    // Cache key: susan_responses_<type name><radius>
    std::string refName("susan_responses_");
    refName += dtype_traits<T>::getName();
    refName += std::to_string(radius);

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        // Block footprint padded by `radius` on every side.
        const size_t LOCAL_MEM_SIZE =
            (SUSAN_THREADS_X + 2 * radius) * (SUSAN_THREADS_Y + 2 * radius);

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE;
        options << " -D BLOCK_X=" << SUSAN_THREADS_X;
        options << " -D BLOCK_Y=" << SUSAN_THREADS_Y;
        options << " -D RADIUS=" << radius;
        options << " -D RESPONSE";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {susan_cl};
        const int lengths[]   = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "susan_responses");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    auto launch = KernelFunctor<Buffer, Buffer, unsigned, unsigned, unsigned,
                                float, float, unsigned>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out, *in, in_off, idim0, idim1, t, g, edge);
}
// Enqueues the "convert" kernel (compiled from hsv_rgb_cl) on the active
// device's queue, passing the buffers and KParams of `out` and `in` plus the
// number of THREADS_X-wide blocks along dimension 0.
//
// One kernel is compiled and cached per (T, isHSV2RGB) combination; the
// isHSV2RGB define is only present when that flag is set.
void hsv2rgb_convert(Param out, const Param in)
{
    int device = getActiveDeviceId();

    // Cache key: hsvrgb_convert_<type name><isHSV2RGB>
    std::string refName("hsvrgb_convert_");
    refName += dtype_traits<T>::getName();
    refName += std::to_string(isHSV2RGB);

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        if (isHSV2RGB) {
            options << " -D isHSV2RGB";
        }
        if (std::is_same<T, double>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {hsv_rgb_cl};
        const int   lengths[] = {hsv_rgb_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "convert");
        addKernelToCache(device, refName, entry);
    }

    int blk_x = divup(in.info.dims[0], THREADS_X);
    int blk_y = divup(in.info.dims[1], THREADS_Y);

    NDRange local(THREADS_X, THREADS_Y);

    // all images are three channels, so batch
    // parameter would be along 4th dimension
    NDRange global(blk_x * in.info.dims[3] * THREADS_X, blk_y * THREADS_Y);

    auto launch = KernelFunctor<Buffer, KParam, Buffer, KParam, int> (*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, out.info,
           *in.data, in.info,
           blk_x);

    CL_DEBUG_FINISH(getQueue());
}
Exemple #9
0
// Enqueues "identity_kernel" (compiled from identity_cl for element type T)
// on the active device's queue, passing out's buffer and KParam plus the
// number of 32 x 8 work-groups along the first two dimensions.
//
// The compiled kernel is kept in the per-device kernel cache, keyed by T's
// type name; ONE and ZERO are baked in as T-typed defines.
static void identity(Param out) {
    int device = getActiveDeviceId();

    std::string refName("identity_kernel");
    refName += dtype_traits<T>::getName();

    kc_entry_t entry = kernelCache(device, refName);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        options << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* sources[] = {identity_cl};
        const int lengths[]   = {identity_cl_len};
        Program prog;
        buildProgram(prog, 1, sources, lengths, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "identity_kernel");
        addKernelToCache(device, refName, entry);
    }

    NDRange local(32, 8);

    int groups_x = divup(out.info.dims[0], local[0]);
    int groups_y = divup(out.info.dims[1], local[1]);

    // Dimensions 2 and 3 of the output are folded into the x and y extents
    // of the global range.
    NDRange global(groups_x * out.info.dims[2] * local[0],
                   groups_y * out.info.dims[3] * local[1]);

    auto launch = KernelFunctor<Buffer, const KParam, int, int>(*entry.ker);

    launch(EnqueueArgs(getQueue(), global, local),
           *(out.data), out.info, groups_x, groups_y);

    CL_DEBUG_FINISH(getQueue());
}
Exemple #10
0
    // Enqueues the kernel returned by get_scan_dim_kernels<...>(1) on the
    // active device's queue, passing out and tmp (data + KParam each), the
    // group counts for dimensions 0, 1 and `dim`, and a per-group limit.
    //
    //   out, tmp    data and KParam of each are forwarded to the kernel
    //   groups_all  group count for each of the four dimensions
    static void bcast_dim_launcher(Param &out,
                                   Param &tmp,
                                   const uint groups_all[4])
    {
        Kernel bcastKernel = get_scan_dim_kernels<Ti, To, op, dim, isFinalPass, threads_y>(1);

        // Rounded-up quotient of out's extent along `dim` over
        // threads_y * groups_all[dim]; forwarded as the last kernel argument.
        uint lim = divup(out.info.dims[dim], (threads_y * groups_all[dim]));

        // Group counts of dimensions 2 and 3 are folded into the x and y
        // extents of the global range.
        NDRange local(THREADS_X, threads_y);
        NDRange global(groups_all[0] * groups_all[2] * local[0],
                       groups_all[1] * groups_all[3] * local[1]);

        auto launch = make_kernel<Buffer, KParam,
                                  Buffer, KParam,
                                  uint, uint,
                                  uint, uint>(bcastKernel);

        launch(EnqueueArgs(getQueue(), global, local),
               out.data, out.info,
               tmp.data, tmp.info,
               groups_all[0], groups_all[1], groups_all[dim], lim);

        CL_DEBUG_FINISH(getQueue());
    }
Exemple #11
0
// Enqueues the "set" kernel (compiled from set_cl with -D T=<type name>) on
// the active device's queue with one work-item per element, passing `ptr`,
// `val` and the element count.
//
// The program and kernel are built once per device, guarded by
// std::call_once.
//
//   ptr       buffer handed to the kernel as its first argument
//   val       value handed to the kernel as its second argument
//   elements  number of work-items to launch; also the third kernel
//             argument. Zero is a no-op.
void
set(Buffer &ptr, T val, const size_t &elements)
{
    // Nothing to do for an empty range. OpenCL versions before 2.1 reject a
    // zero-sized global work size with CL_INVALID_GLOBAL_WORK_SIZE.
    if (elements == 0) { return; }

    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static Program            setProgs[DeviceManager::MAX_DEVICES];
    static Kernel           setKernels[DeviceManager::MAX_DEVICES];

    int device = getActiveDeviceId();

    std::call_once( compileFlags[device], [device] () {
                Program::Sources setSrc;
                setSrc.emplace_back(set_cl, set_cl_len);

                setProgs[device] = Program(getContext(), setSrc);

                string opt = string("-D T=") + dtype_traits<T>::getName();
                setProgs[device].build(opt.c_str());

                setKernels[device] = Kernel(setProgs[device], "set");
            });

    // The count is passed as `unsigned long long`, which is 64 bits on every
    // host ABI. Host `unsigned long` is only 32 bits on LLP64 platforms (e.g.
    // 64-bit Windows), whereas OpenCL C `unsigned long` is always 64 bits, so
    // the previous host type could produce a kernel-argument size mismatch.
    auto setKern = make_kernel<Buffer, T, const unsigned long long>(setKernels[device]);
    setKern(EnqueueArgs(getQueue(), NDRange(elements)), ptr, val,
            static_cast<unsigned long long>(elements));
}
Exemple #12
0
    // Enqueues the "copy" kernel (compiled from copy_cl) on the active
    // device's queue to transfer `src` into `dst`, passing a default value,
    // a scale factor (narrowed to float) and the extents the kernel should
    // treat as the target region.
    //
    // The program is compiled once per device (guarded by std::call_once)
    // with inType/outType and SAME_DIMS defined, plus USE_DOUBLE when either
    // type is double/cdouble. OpenCL errors are converted with
    // CL_TO_AF_ERROR.
    //
    //   dst, src       destination and source buffers with their KParams
    //   ndims          when 1, the whole DIM0 * DIM1 work-group is treated
    //                  as covering dimension 0 only when counting blocks
    //   default_value  forwarded to the kernel unchanged
    //   factor         forwarded to the kernel as a float
    void copy(Param dst, const Param src, int ndims, outType default_value, double factor)
    {
        try {
            static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
            static std::map<int, Program*>    cpyProgs;
            static std::map<int, Kernel*>   cpyKernels;

            int device = getActiveDeviceId();

            std::call_once(compileFlags[device], [&]() {

                        std::ostringstream options;
                        options << " -D inType=" << dtype_traits<inType>::getName()
                            << " -D outType=" << dtype_traits<outType>::getName()
                            << " -D inType_" << dtype_traits<inType>::getName()
                            << " -D outType_" << dtype_traits<outType>::getName()
                            << " -D SAME_DIMS=" << same_dims;
                        if (std::is_same<inType, double>::value  ||
                            std::is_same<inType, cdouble>::value ||
                            std::is_same<outType, double>::value ||
                            std::is_same<outType, cdouble>::value) {
                            options << " -D USE_DOUBLE";
                        }

                        Program prog;
                        buildProgram(prog, copy_cl, copy_cl_len, options.str());
                        cpyProgs[device]   = new Program(prog);
                        cpyKernels[device] = new Kernel(*cpyProgs[device], "copy");
                    });

            // The launch always uses DIM0 x DIM1 work-groups; local_size is
            // only used to decide how many blocks cover dimensions 0 and 1.
            NDRange local(DIM0, DIM1);
            size_t local_size[] = {DIM0, DIM1};

            local_size[0] *= local_size[1];
            if (ndims == 1) {
                local_size[1] = 1;
            }

            int blk_x = divup(dst.info.dims[0], local_size[0]);
            int blk_y = divup(dst.info.dims[1], local_size[1]);

            // Dimensions 2 and 3 of dst are folded into the x and y extents
            // of the global range.
            NDRange global(blk_x * dst.info.dims[2] * DIM0,
                    blk_y * dst.info.dims[3] * DIM1);

            // Target extents: dst's own extents when same_dims is set,
            // otherwise the element-wise minimum of dst and src extents.
            dims_t trgt_dims;
            if (same_dims) {
                trgt_dims= {{dst.info.dims[0], dst.info.dims[1], dst.info.dims[2], dst.info.dims[3]}};
            } else {
                dim_t trgt_l = std::min(dst.info.dims[3], src.info.dims[3]);
                dim_t trgt_k = std::min(dst.info.dims[2], src.info.dims[2]);
                dim_t trgt_j = std::min(dst.info.dims[1], src.info.dims[1]);
                dim_t trgt_i = std::min(dst.info.dims[0], src.info.dims[0]);
                trgt_dims= {{trgt_i, trgt_j, trgt_k, trgt_l}};
            }

            auto copyOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                      outType, float, dims_t,
                                      int, int
                                     >(*cpyKernels[device]);

            copyOp(EnqueueArgs(getQueue(), global, local),
                   *dst.data, dst.info, *src.data, src.info,
                   default_value, static_cast<float>(factor), trgt_dims, blk_x, blk_y);
            CL_DEBUG_FINISH(getQueue());
        } catch (const cl::Error &err) {
            // Caught by const reference: avoids copying the exception object
            // and preserves its dynamic type.
            CL_TO_AF_ERROR(err);
            throw;
        }
    }
Exemple #13
0
// Enqueues the "csrmm_nt" kernel (compiled from csrmm_cl) on the active
// device's queue, passing the CSR arrays (values, rowIdx, colIdx), the
// right-hand side, alpha/beta and a zero-initialised per-x-group counter
// buffer.
//
// One kernel is compiled and cached per (T, use_alpha, use_beta, use_greedy)
// combination. use_alpha is false when alpha == 1 and use_beta is false when
// beta == 0; both are baked into the kernel as defines.
//
//   out                     output; only its buffer is passed to the kernel
//   values, rowIdx, colIdx  CSR arrays; M is taken as rowIdx.dims[0] - 1
//   rhs                     right-hand side; N is taken as rhs.dims[0]
//   alpha, beta             scalars forwarded to the kernel
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string ref_name = std::string("csrmm_nt_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam,
                      T, T, Buffer>(csrmm_nt_kernel);
    NDRange local(THREADS_PER_GROUP, 1);
    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    // The y group count is capped at MAX_CSRMM_GROUPS.
    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // Zero-filled host vector used to initialise the device-side counters,
    // one int per x group.
    std::vector<int> count(groups_x);
    const size_t counter_bytes = count.size() * sizeof(int);
    cl::Buffer *counter = bufferAlloc(counter_bytes);

    // Release the scratch counter buffer on every exit path; previously it
    // leaked if the write or the kernel enqueue threw.
    try {
        getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, counter_bytes,
                                      count.data());

        csrmm_nt_func(EnqueueArgs(getQueue(), global, local), *out.data,
                      *values.data, *rowIdx.data, *colIdx.data, M, N, *rhs.data,
                      rhs.info, alpha, beta, *counter);
    } catch (...) {
        bufferFree(counter);
        throw;
    }

    bufferFree(counter);
}
Exemple #14
0
        // Enqueues "approx1_kernel" (compiled from approx1_cl) on the active
        // device's queue, passing the buffers and KParams of out, in and
        // pos, the off-grid value and the number of THREADS-wide blocks
        // along dimension 0.
        //
        // The program is compiled once per device (guarded by
        // std::call_once) with Ty, Tp, ZERO, CPLX and, depending on the
        // `method` template argument, INTERP=NEAREST or INTERP=LINEAR
        // defined. Any other method leaves INTERP undefined. OpenCL errors
        // are converted with CL_TO_AF_ERROR.
        void approx1(Param out, const Param in, const Param pos, const float offGrid)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>  approxProgs;
                static std::map<int, Kernel*> approxKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    ToNum<Ty> toNum;
                    std::ostringstream options;
                    options << " -D Ty="        << dtype_traits<Ty>::getName()
                            << " -D Tp="        << dtype_traits<Tp>::getName()
                            << " -D ZERO="      << toNum(scalar<Ty>(0));

                    if((af_dtype) dtype_traits<Ty>::af_type == c32 ||
                       (af_dtype) dtype_traits<Ty>::af_type == c64) {
                        options << " -D CPLX=1";
                    } else {
                        options << " -D CPLX=0";
                    }
                    if (std::is_same<Ty, double>::value ||
                        std::is_same<Ty, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }

                    switch(method) {
                        case AF_INTERP_NEAREST: options << " -D INTERP=NEAREST";
                            break;
                        case AF_INTERP_LINEAR:  options << " -D INTERP=LINEAR";
                            break;
                        default:
                            break;
                    }
                    Program prog;
                    buildProgram(prog, approx1_cl, approx1_cl_len, options.str());
                    approxProgs[device] = new Program(prog);

                    approxKernels[device] = new Kernel(*approxProgs[device], "approx1_kernel");
                });


                auto approx1Op = make_kernel<Buffer, const KParam, const Buffer, const KParam,
                                       const Buffer, const KParam, const float, const int>
                                      (*approxKernels[device]);

                NDRange local(THREADS, 1, 1);
                int blocksPerMat = divup(out.info.dims[0], local[0]);

                // Each global extent is (number of groups) * (local size in
                // that same dimension). The y extent therefore scales by
                // local[1]; scaling it by local[0] launched THREADS times
                // more work-groups along y than there are dims[2] * dims[3]
                // batches.
                NDRange global(blocksPerMat * local[0] * out.info.dims[1],
                               out.info.dims[2] * out.info.dims[3] * local[1],
                               1);

                approx1Op(EnqueueArgs(getQueue(), global, local),
                          *out.data, out.info, *in.data, in.info,
                          *pos.data, pos.info, offGrid, blocksPerMat);

                CL_DEBUG_FINISH(getQueue());
            } catch (const cl::Error &err) {
                // Caught by const reference: avoids copying the exception
                // object and preserves its dynamic type.
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Exemple #15
0
void fast(unsigned* out_feat,
          Param &x_out,
          Param &y_out,
          Param &score_out,
          Param in,
          const float thr,
          const float feature_ratio,
          const unsigned edge)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fastProgs;
        static std::map<int, Kernel*>  lfKernel;
        static std::map<int, Kernel*>  nmKernel;
        static std::map<int, Kernel*>  gfKernel;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D ARC_LENGTH=" << arc_length
                        << " -D NONMAX=" << static_cast<unsigned>(nonmax);

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, fast_cl, fast_cl_len, options.str());
                fastProgs[device] = new Program(prog);

                lfKernel[device] = new Kernel(*fastProgs[device], "locate_features");
                nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts");
                gfKernel[device] = new Kernel(*fastProgs[device], "get_features");
            });

        const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio);

        // Matrix containing scores for detected features, scores are stored in the
        // same coordinates as features, dimensions should be equal to in.
        cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float));
        std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0);
        getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]);

        cl::Buffer *d_flags = d_score;
        if (nonmax) {
            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
        }

        const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X);
        const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y);

        // Locate features kernel sizes
        const NDRange local(FAST_THREADS_X, FAST_THREADS_Y);
        const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y);

        auto lfOp = make_kernel<Buffer, KParam,
                                Buffer, const float, const unsigned,
                                LocalSpaceArg> (*lfKernel[device]);

        lfOp(EnqueueArgs(getQueue(), global, local),
             *in.data, in.info, *d_score, thr, edge,
             cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T)));
        CL_DEBUG_FINISH(getQueue());

        const int blk_nonmax_x = divup(in.info.dims[0], 64);
        const int blk_nonmax_y = divup(in.info.dims[1], 64);

        // Nonmax kernel sizes
        const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y);
        const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y);

        unsigned count_init = 0;
        cl::Buffer *d_total = bufferAlloc(sizeof(unsigned));
        getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init);

        //size_t *global_nonmax_dims = global_nonmax();
        size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned);
        cl::Buffer *d_counts  = bufferAlloc(blocks_sz);
        cl::Buffer *d_offsets = bufferAlloc(blocks_sz);

        auto nmOp = make_kernel<Buffer, Buffer, Buffer,
                                Buffer, Buffer,
                                KParam, const unsigned> (*nmKernel[device]);
        nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                         *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge);
        CL_DEBUG_FINISH(getQueue());

        unsigned total;
        getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total);
        total = total < max_feat ? total : max_feat;

        if (total > 0) {
            size_t out_sz = total * sizeof(float);
            x_out.data = bufferAlloc(out_sz);
            y_out.data = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);

            auto gfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer,
                                    KParam, const unsigned,
                                    const unsigned> (*gfKernel[device]);
            gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                             *x_out.data, *y_out.data, *score_out.data,
                             *d_flags, *d_counts, *d_offsets,
                             in.info, total, edge);
            CL_DEBUG_FINISH(getQueue());
        }

        *out_feat = total;

        x_out.info.dims[0] = total;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total;
        score_out.info.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = total;
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = total;
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = total;
        }

        bufferFree(d_score);
        if (nonmax) bufferFree(d_flags);
        bufferFree(d_total);
        bufferFree(d_counts);
        bufferFree(d_offsets);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
// Launches the "reorder_output" OpenCL kernel, which writes into `out`.
//
// The program is built from fftconvolve_reorder_cl at most once per device
// (std::call_once on a per-device flag); the resulting Program and Kernel
// objects are kept in function-local static maps keyed by device id.
// NOTE(review): T, convT, isDouble, roundOut and expand are not declared in
// this block; presumably they are template parameters - confirm against the
// enclosing declaration.
//
// Parameters:
//   out     - destination buffer and descriptor handed to the kernel
//   packed  - forwarded to calcParamSizes()
//   sig     - signal; divup(sig.info.dims[0], 2) is passed to the kernel
//   filter  - filter; its descriptor is passed to the kernel
//   baseDim - forwarded to calcParamSizes() and to the kernel
//   kind    - ONE2MANY makes the kernel read the filter-side temporary,
//             any other value makes it read the signal-side temporary
//
// A cl::Error raised inside is handed to CL_TO_AF_ERROR.
void reorderOutputHelper(Param out,
                         Param packed,
                         Param sig,
                         Param filter,
                         const int baseDim,
                         ConvolveBatchKind kind)
{
    try {
        static std::once_flag     compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fftconvolveProgs;
        static std::map<int, Kernel*>  roKernel;

        int device = getActiveDeviceId();

        // Build the program and create the kernel on first use for this device
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D ROUND_OUT=" << (int)roundOut
                        << " -D EXPAND=" << (int)expand;

                if ((af_dtype) dtype_traits<convT>::af_type == c32) {
                    options << " -D CONVT=float";
                }
                else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) {
                    options << " -D CONVT=double"
                            << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog,
                             fftconvolve_reorder_cl,
                             fftconvolve_reorder_cl_len,
                             options.str());
                fftconvolveProgs[device] = new Program(prog);

                roKernel[device] = new Kernel(*fftconvolveProgs[device], "reorder_output");
            });

        Param sig_tmp, filter_tmp;
        calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind);

        // Number of packed complex elements in dimension 0
        int sig_half_d0 = divup(sig.info.dims[0], 2);

        // One work-item per element spanned by out (strides[3] * dims[3])
        int blocks = divup(out.info.strides[3] * out.info.dims[3], THREADS);

        NDRange local(THREADS);
        NDRange global(blocks * THREADS);

        auto roOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                KParam, const int,
                                const int> (*roKernel[device]);

        // The launch is identical for every batch mode except for which
        // temporary is read, so pick the source once instead of duplicating
        // the whole call.
        const Param &src = (kind == ONE2MANY) ? filter_tmp : sig_tmp;

        roOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info,
             *src.data, src.info,
             filter.info, sig_half_d0, baseDim);
        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) {
        // Caught by const reference: no copy of the exception object
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Exemple #17
0
// Launches the "unwrap_kernel" OpenCL kernel, reading from `in` and writing
// to `out`.
//
// A kernel variant is compiled per element type T and per value of
// is_column, and is stored in the kernel cache under "unwrap_<T>_<0|1>".
// NOTE(review): T is not declared in this block; presumably a template
// parameter - confirm against the enclosing declaration.
//
// wx, wy, sx, sy, px, py and nx are passed to the kernel unchanged;
// wx * wy additionally determines how many repetitions (reps) the kernel is
// told to perform.
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
            const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
            const dim_t nx, const bool is_column) {
    // Cache key for this (type, layout) combination
    std::string cacheKey("unwrap_");
    cacheKey += dtype_traits<T>::getName();
    cacheKey += "_";
    cacheKey += std::to_string(is_column);

    const int deviceId = getActiveDeviceId();
    kc_entry_t cached  = kernelCache(deviceId, cacheKey);

    // Nothing cached yet for this key: build the program and register it
    const bool needsBuild = (cached.prog == nullptr && cached.ker == nullptr);
    if (needsBuild) {
        ToNumStr<T> numToStr;
        std::ostringstream buildOpts;
        buildOpts << " -D is_column=" << is_column;
        buildOpts << " -D ZERO=" << numToStr(scalar<T>(0));
        buildOpts << " -D T=" << dtype_traits<T>::getName();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        Program built;
        buildProgram(built, unwrap_cl, unwrap_cl_len, buildOpts.str());

        cached.prog = new Program(built);
        cached.ker  = new Kernel(*cached.prog, "unwrap_kernel");

        addKernelToCache(deviceId, cacheKey, cached);
    }

    // Work-group shape (threads*) and work-group counts (groups*)
    dim_t threadsX = 1;
    dim_t threadsY = 1;
    dim_t groupsX  = 1;
    dim_t reps     = 1;
    const dim_t groupsY = out.info.dims[2] * out.info.dims[3];

    if (!is_column) {
        threadsX = THREADS_X;
        threadsY = THREADS_Y;
        groupsX  = divup(out.info.dims[0], threadsX);
        reps     = divup((wx * wy), threadsY);
    } else {
        threadsX = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
        threadsY = THREADS_PER_GROUP / threadsX;
        groupsX  = divup(out.info.dims[1], threadsY);
        reps     = divup((wx * wy), threadsX);
    }

    NDRange local(threadsX, threadsY);
    NDRange global(local[0] * groupsX, local[1] * groupsY);

    auto launchUnwrap =
        KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                      const int, const int, const int, const int, const int,
                      const int, const int, const int>(*cached.ker);

    launchUnwrap(EnqueueArgs(getQueue(), global, local),
                 *out.data, out.info,
                 *in.data, in.info,
                 wx, wy, sx, sy, px, py, nx, reps);

    CL_DEBUG_FINISH(getQueue());
}
// Reference wrapper showing how an OpenCL kernel is compiled, cached and
// launched: builds example_cl once per device and launches its "example"
// kernel with c, a, b and p as arguments.
//
// Parameters:
//   c - first buffer argument; its dims[0] and dims[1] size the launch grid
//   a - second buffer argument
//   b - third buffer argument
//   p - enum value, passed to the kernel as an int
//
// A cl::Error raised inside is handed to CL_TO_AF_ERROR.
// NOTE(review): T is not declared in this block; presumably a template
// parameter - confirm against the enclosing declaration.
void exampleFunc(Param c, const Param a, const Param b, const af_someenum_t p)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  egProgs;
        static std::map<int, Kernel*> egKernels;

        int device = getActiveDeviceId();

        // std::call_once is used to ensure OpenCL kernels
        // are compiled only once for any given device and combination
        // of template parameters to this kernel wrapper function 'exampleFunc<T>'
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                // You can pass any template parameters as compile options
                // to the kernel compilation step. This is the equivalent of
                // having templated kernels in CUDA

                // The following option is passed to kernel compilation
                // if template parameter T is double or complex double
                // to enable FP64 extension
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                Program prog;
                // below helper function 'buildProgram' uses the option string
                // we just created and compiles the kernel string
                // 'example_cl' which was created by our opencl kernel code obfuscation
                // stage
                buildProgram(prog, example_cl, example_cl_len, options.str());

                // create a cl::Program object on heap
                egProgs[device]   = new Program(prog);

                // create a cl::Kernel object on heap
                egKernels[device] = new Kernel(*egProgs[device], "example");
            });

        // configure work group parameters
        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(c.info.dims[0], THREADS_X);
        int blk_y = divup(c.info.dims[1], THREADS_Y);

        // configure global launch parameters
        NDRange global(blk_x * THREADS_X, blk_y * THREADS_Y);

        // create a kernel functor from the cl::Kernel object
        // corresponding to the device on which current execution
        // is happening.
        auto exampleFuncOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                           Buffer, KParam, int>(*egKernels[device]);

        // launch the kernel
        exampleFuncOp(EnqueueArgs(getQueue(), global, local),
                    *c.data, c.info, *a.data, a.info, *b.data, b.info,
                    static_cast<int>(p));

        // Below Macro activates validations ONLY in DEBUG
        // mode as its name indicates
        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) { // Catch all cl::Errors (by const
                                     // reference, so the exception object is
                                     // not copied) and convert them to
                                     // appropriate ArrayFire error codes
        CL_TO_AF_ERROR(err);
    }
}
Exemple #19
0
// Launches the "swapdblk" OpenCL kernel on dA and dB with n / nb work-groups
// of nb work-items each.
//
// The program is built from swapdblk_cl at most once per device
// (std::call_once) and cached in function-local static maps.
// NOTE(review): T is not declared in this block; presumably a template
// parameter - confirm against the enclosing declaration.
//
// Parameters:
//   n          - must be >= 0; nothing is launched when n / nb == 0
//   nb         - block size, also the work-group size; must be in [1, 1024]
//   dA, dB     - OpenCL buffers passed to the kernel
//   dA_offset,
//   dB_offset  - offsets passed to the kernel alongside each buffer
//   ldda, inca - must satisfy ldda >= (nblocks-1)*nb*inca + nb and inca >= 0
//   lddb, incb - must satisfy lddb >= (nblocks-1)*nb*incb + nb and incb >= 0
//
// An argument that fails validation is reported through
// AF_ERROR("Invalid configuration", AF_ERR_INTERNAL).
void swapdblk(int n, int nb,
              cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb)
{

    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static std::map<int, Program*>  swpProgs;
    static std::map<int, Kernel*> swpKernels;

    int device = getActiveDeviceId();

    std::call_once(compileFlags[device], [device] () {

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName();

            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            cl::Program prog;
            buildProgram(prog, swapdblk_cl, swapdblk_cl_len, options.str());
            swpProgs[device] = new Program(prog);

            swpKernels[device] = new Kernel(*swpProgs[device], "swapdblk");
        });

    // nb is the divisor below, so it has to be rejected *before* dividing:
    // nb == 0 would be a division by zero, and a negative nb could otherwise
    // slip through the nblocks == 0 early return without being reported.
    if (nb < 1) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    int nblocks = n / nb;

    if(nblocks == 0)
        return;

    // Negative codes identify the offending argument position
    int info = 0;
    if (n < 0) {
        info = -1;
    } else if (nb > 1024) {
        info = -2;
    } else if (ldda < (nblocks-1)*nb*inca + nb) {
        info = -4;
    } else if (inca < 0) {
        info = -5;
    } else if (lddb < (nblocks-1)*nb*incb + nb) {
        info = -7;
    } else if (incb < 0) {
        info = -8;
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    NDRange local(nb);
    NDRange global(nblocks * nb);
    auto swapdOp = make_kernel<int,
                               cl_mem, unsigned long long, int, int,
                               cl_mem, unsigned long long, int, int>(*swpKernels[device]);

    swapdOp(EnqueueArgs(getQueue(), global, local),
            nb,
            dA, dA_offset, ldda, inca,
            dB, dB_offset, lddb, incb);

}
Exemple #20
0
// Launches the "mean_first_kernel" OpenCL kernel.
//
// Whether input weights (inWeight) and output weights (owt) take part is
// decided by their element counts: an array whose dims multiply to zero is
// treated as absent. Each combination compiles its own kernel variant
// (INPUT_WEIGHT / OUTPUT_WEIGHT defines) and is launched with the matching
// argument list.
// NOTE(review): Ti, Tw and To are not declared in this block; presumably
// template parameters - confirm against the enclosing declaration.
//
// Parameters:
//   out, owt     - output values and output weights
//   in, inWeight - input values and input weights
//   threads_x    - work-group size along x (also baked in as DIMX)
//   groups_x,
//   groups_y     - work-group counts, scaled by in.info.dims[2] / dims[3]
void mean_first_launcher(Param out, Param owt,
        Param in, Param inWeight,
        const int threads_x,
        const uint groups_x,
        const uint groups_y)
{
    // Product of all four dimensions of a Param
    const auto elementCount = [](const Param &p) {
        return p.info.dims[0] *
               p.info.dims[1] *
               p.info.dims[2] *
               p.info.dims[3];
    };

    const bool hasInWeight  = (elementCount(inWeight) != 0);
    const bool hasOutWeight = (elementCount(owt) != 0);

    // Cache key: types, work-group width and which weights are present
    std::string cacheKey("mean_0_");
    cacheKey += dtype_traits<Ti>::getName();
    cacheKey += "_";
    cacheKey += dtype_traits<Tw>::getName();
    cacheKey += "_";
    cacheKey += dtype_traits<To>::getName();
    cacheKey += "_";
    cacheKey += std::to_string(threads_x);
    cacheKey += "_";
    cacheKey += std::to_string(hasInWeight);
    cacheKey += "_";
    cacheKey += std::to_string(hasOutWeight);

    const int deviceId = getActiveDeviceId();
    kc_entry_t cached  = kernelCache(deviceId, cacheKey);

    if (cached.prog == nullptr && cached.ker == nullptr) {
        Binary<To, af_add_t> addOp;
        ToNumStr<To> outToStr;
        ToNumStr<Tw> weightToStr;
        Transform<uint, Tw, af_add_t> toWeight;

        std::ostringstream buildOpts;
        buildOpts << " -D Ti=" << dtype_traits<Ti>::getName();
        buildOpts << " -D Tw=" << dtype_traits<Tw>::getName();
        buildOpts << " -D To=" << dtype_traits<To>::getName();
        buildOpts << " -D DIMX=" << threads_x;
        buildOpts << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;
        buildOpts << " -D init_To=" << outToStr(addOp.init());
        buildOpts << " -D init_Tw=" << weightToStr(toWeight(0));
        buildOpts << " -D one_Tw=" << weightToStr(toWeight(1));

        if (hasInWeight)  { buildOpts << " -D INPUT_WEIGHT"; }
        if (hasOutWeight) { buildOpts << " -D OUTPUT_WEIGHT"; }

        const bool needsDouble = std::is_same<Ti, double>::value ||
                                 std::is_same<Ti, cdouble>::value ||
                                 std::is_same<To, double>::value;
        if (needsDouble) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char *sources[] = {mean_ops_cl, mean_first_cl};
        const int   lengths[] = {mean_ops_cl_len, mean_first_cl_len};
        Program built;
        buildProgram(built, 2, sources, lengths, buildOpts.str());
        cached.prog = new Program(built);
        cached.ker  = new Kernel(*cached.prog, "mean_first_kernel");

        addKernelToCache(deviceId, cacheKey, cached);
    }

    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
                   groups_y * in.info.dims[3] * local[1]);

    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

    // Both single-weight variants take three (buffer, descriptor) pairs
    using ThreeArrayOp = KernelFunctor<Buffer, KParam,
                                       Buffer, KParam,
                                       Buffer, KParam,
                                       uint, uint, uint>;

    if (hasInWeight) {
        if (hasOutWeight) {
            // out, owt, in, inWeight
            auto launch = KernelFunctor<Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        uint, uint, uint>(*cached.ker);
            launch(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info,
                   *owt.data, owt.info,
                   *in.data, in.info,
                   *inWeight.data, inWeight.info,
                   groups_x, groups_y, repeat);
        } else {
            // out, in, inWeight
            auto launch = ThreeArrayOp(*cached.ker);
            launch(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info,
                   *in.data, in.info,
                   *inWeight.data, inWeight.info,
                   groups_x, groups_y, repeat);
        }
    } else {
        if (hasOutWeight) {
            // out, owt, in
            auto launch = ThreeArrayOp(*cached.ker);
            launch(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info,
                   *owt.data, owt.info,
                   *in.data, in.info,
                   groups_x, groups_y, repeat);
        } else {
            // out, in
            auto launch = KernelFunctor<Buffer, KParam,
                                        Buffer, KParam,
                                        uint, uint, uint>(*cached.ker);
            launch(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info,
                   *in.data, in.info,
                   groups_x, groups_y, repeat);
        }
    }

    CL_DEBUG_FINISH(getQueue());
}
        // Launches the CSR matrix-vector kernel ("csrmv_block") writing
        // into `out`.
        //
        // Kernel variants are compiled per element type T and per value of
        // use_alpha (alpha != 1), use_beta (beta != 0), use_greedy and the
        // thread count, and stored in the kernel cache under a key built
        // from those values.
        // NOTE(review): T is not declared in this block; presumably a
        // template parameter - confirm against the enclosing declaration.
        //
        // Parameters:
        //   out                    - output buffer
        //   values, rowIdx, colIdx - sparse matrix buffers;
        //                            rowIdx.info.dims[0] - 1 is passed to the
        //                            kernel as M
        //   rhs                    - right-hand-side buffer and descriptor
        //   alpha, beta            - scalars passed to the kernel
        //
        // A temporary int buffer initialised to zero is passed as the last
        // kernel argument; it is released on both the normal and the
        // exceptional path.
        void csrmv(Param out,
                   const Param &values, const Param &rowIdx, const Param &colIdx,
                   const Param &rhs, const T alpha, const T beta)
        {
            bool use_alpha = (alpha != scalar<T>(1.0));
            bool use_beta = (beta != scalar<T>(0.0));

            // Using greedy indexing is causing performance issues on many platforms
            // FIXME: Figure out why
            bool use_greedy = false;

            // FIXME: Find a better number based on average non zeros per row
            int threads = 64;

            std::string ref_name =
                std::string("csrmv_") +
                std::string(dtype_traits<T>::getName()) +
                std::string("_") +
                std::to_string(use_alpha) +
                std::string("_") +
                std::to_string(use_beta) +
                std::string("_") +
                std::to_string(use_greedy) +
                std::string("_") +
                std::to_string(threads);

            int device = getActiveDeviceId();

            kc_entry_t entry = kernelCache(device, ref_name);

            if (entry.prog==0 && entry.ker==0) {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                options << " -D USE_ALPHA=" << use_alpha;
                options << " -D USE_BETA=" << use_beta;
                options << " -D USE_GREEDY=" << use_greedy;
                options << " -D THREADS=" << threads;

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                if (std::is_same<T, cfloat>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D IS_CPLX=1";
                } else {
                    options << " -D IS_CPLX=0";
                }

                const char *ker_strs[] = {csrmv_cl};
                const int   ker_lens[] = {csrmv_cl_len};

                Program prog;
                buildProgram(prog, 1, ker_strs, ker_lens, options.str());
                entry.prog = new Program(prog);
                // Two kernels share one cache entry: [0] thread, [1] block
                entry.ker  = new Kernel[2];
                entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
                entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

                addKernelToCache(device, ref_name, entry);
            }

            int count = 0;
            cl::Buffer *counter = bufferAlloc(sizeof(int));

            // counter was obtained from bufferAlloc and has to go back
            // through bufferFree even if one of the OpenCL calls below
            // throws; otherwise the buffer is leaked on the error path.
            try {
                // Blocking write, so reading the stack variable is safe
                getQueue().enqueueWriteBuffer(*counter, CL_TRUE,
                                              0,
                                              sizeof(int),
                                              &count);

                // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
                bool is_csrmv_block = true;
                auto csrmv_kernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];
                auto csrmv_func = KernelFunctor<Buffer,
                                                Buffer, Buffer, Buffer,
                                                int,
                                                Buffer, KParam, T, T, Buffer>(csrmv_kernel);

                NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);
                int M = rowIdx.info.dims[0] - 1;

                int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
                groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);
                NDRange global(local[0] * groups_x, 1);

                csrmv_func(EnqueueArgs(getQueue(), global, local),
                            *out.data, *values.data, *rowIdx.data, *colIdx.data,
                            M, *rhs.data, rhs.info, alpha, beta, *counter);

                CL_DEBUG_FINISH(getQueue());
            } catch (...) {
                bufferFree(counter);
                throw;
            }

            bufferFree(counter);
        }
Exemple #22
0
// Launches the "swapdblk" OpenCL kernel on dA and dB with n / nb work-groups
// of nb work-items each, on the caller-supplied command queue.
//
// The kernel is compiled once per element type T and stored in the kernel
// cache under "swapdblk_<T>".
// NOTE(review): T is not declared in this block; presumably a template
// parameter - confirm against the enclosing declaration.
//
// Parameters:
//   n          - must be >= 0; nothing is launched when n / nb == 0
//   nb         - block size, also the work-group size; must be in [1, 1024]
//   dA, dB     - OpenCL buffers; wrapped in cl::Buffer with retain = true
//   dA_offset,
//   dB_offset  - offsets passed to the kernel alongside each buffer
//   ldda, inca - must satisfy ldda >= (nblocks-1)*nb*inca + nb and inca >= 0
//   lddb, incb - must satisfy lddb >= (nblocks-1)*nb*incb + nb and incb >= 0
//   queue      - command queue the kernel is enqueued on
//
// An argument that fails validation is reported through
// AF_ERROR("Invalid configuration", AF_ERR_INTERNAL).
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb,
              cl_command_queue queue) {
    std::string refName =
        std::string("swapdblk_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;

        options << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {swapdblk_cl};
        const int ker_lens[]   = {swapdblk_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "swapdblk");

        addKernelToCache(device, refName, entry);
    }

    // nb is the divisor below, so it has to be rejected *before* dividing:
    // nb == 0 would be a division by zero, and a negative nb could otherwise
    // slip through the nblocks == 0 early return without being reported.
    if (nb < 1) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    int nblocks = n / nb;

    if (nblocks == 0) return;

    // Negative codes identify the offending argument position
    int info = 0;
    if (n < 0) {
        info = -1;
    } else if (nb > 1024) {
        info = -2;
    } else if (ldda < (nblocks - 1) * nb * inca + nb) {
        info = -4;
    } else if (inca < 0) {
        info = -5;
    } else if (lddb < (nblocks - 1) * nb * incb + nb) {
        info = -7;
    } else if (incb < 0) {
        info = -8;
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    NDRange local(nb);
    NDRange global(nblocks * nb);

    // retain = true: the wrappers take their own reference on the cl_mem
    cl::Buffer dAObj(dA, true);
    cl::Buffer dBObj(dB, true);

    auto swapdOp =
        KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer,
                      unsigned long long, int, int>(*entry.ker);

    cl::CommandQueue q(queue);
    swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca,
            dBObj, dB_offset, lddb, incb);
}
// Launches the "complex_multiply" OpenCL kernel on the signal-side and
// filter-side temporaries derived from `packed`, writing into `packed`.
//
// The program is built from fftconvolve_multiply_cl at most once per device
// (std::call_once); the numeric values of the ConvolveBatchKind enumerators
// are passed as compile-time defines so the kernel can switch on `kind`.
// NOTE(review): T, convT and isDouble are not declared in this block;
// presumably they are template parameters - confirm against the enclosing
// declaration.
//
// Parameters:
//   packed  - buffer handed to the kernel as its first argument and
//             forwarded to calcParamSizes()
//   sig     - forwarded to calcParamSizes()
//   filter  - forwarded to calcParamSizes()
//   baseDim - forwarded to calcParamSizes()
//   kind    - batch mode, passed to the kernel as an int
//
// A cl::Error raised inside is handed to CL_TO_AF_ERROR.
void complexMultiplyHelper(Param packed,
                           Param sig,
                           Param filter,
                           const int baseDim,
                           ConvolveBatchKind kind)
{
    try {
        static std::once_flag     compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fftconvolveProgs;
        static std::map<int, Kernel*>  cmKernel;

        int device = getActiveDeviceId();

        // Build the program and create the kernel on first use for this device
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D ONE2ONE=" << (int)ONE2ONE
                        << " -D MANY2ONE=" << (int)MANY2ONE
                        << " -D ONE2MANY=" << (int)ONE2MANY
                        << " -D MANY2MANY=" << (int)MANY2MANY;

                if ((af_dtype) dtype_traits<convT>::af_type == c32) {
                    options << " -D CONVT=float";
                }
                else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) {
                    options << " -D CONVT=double"
                            << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog,
                             fftconvolve_multiply_cl,
                             fftconvolve_multiply_cl_len,
                             options.str());
                fftconvolveProgs[device] = new Program(prog);

                cmKernel[device] = new Kernel(*fftconvolveProgs[device], "complex_multiply");
            });

        Param sig_tmp, filter_tmp;
        calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind);

        // The launch covers the larger of the two packed arrays
        int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3];
        int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3];
        int mul_elem = (sig_packed_elem < filter_packed_elem) ?
                            filter_packed_elem : sig_packed_elem;

        int blocks = divup(mul_elem, THREADS);

        NDRange local(THREADS);
        NDRange global(blocks * THREADS);

        // Multiply filter and signal FFT arrays
        auto cmOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                Buffer, KParam,
                                const int, const int> (*cmKernel[device]);

        cmOp(EnqueueArgs(getQueue(), global, local),
             *packed.data, packed.info,
             *sig_tmp.data, sig_tmp.info,
             *filter_tmp.data, filter_tmp.info,
             mul_elem, (int)kind);
        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) {
        // Caught by const reference: no copy of the exception object
        CL_TO_AF_ERROR(err);
        throw;
    }
}
// Finds, for every query vector, the best-matching training vector and
// writes the match index to `idx` and its distance to `dist`.
//
// Runs in two kernel passes: the first ("nearest_neighbour_unroll" when the
// feature length is a power of two, "nearest_neighbour" otherwise) fills two
// temporary buffers of nblk * nquery entries; the second ("select_matches")
// reduces them into idx/dist.
// NOTE(review): T, To and dist_type are not declared in this block;
// presumably they are template parameters - confirm against the enclosing
// declaration.
//
// Parameters:
//   idx      - output buffer for match indices
//   dist     - output buffer for match distances
//   query    - query vectors
//   train    - training vectors
//   dist_dim - dimension holding the feature elements; the other of
//              dimensions 0/1 indexes the samples
//   n_dist   - not used in this function
//
// A cl::Error raised inside is handed to CL_TO_AF_ERROR.
void nearest_neighbour(Param idx,
                       Param dist,
                       Param query,
                       Param train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    try {
        const unsigned feat_len = query.info.dims[dist_dim];
        const To max_dist = maxval<To>();

        // Determine maximum feat_len capable of using shared memory (faster)
        cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
        size_t ltrain_sz = THREADS * feat_len * sizeof(T);
        bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz));
        size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;

        // unroll_len is feat_len when that is a power of two, otherwise 0
        unsigned unroll_len = nextpow2(feat_len);
        if (unroll_len != feat_len) unroll_len = 0;

        std::string ref_name =
            std::string("knn_") +
            std::to_string(dist_type) +
            std::string("_") +
            std::to_string(use_lmem) +
            std::string("_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::to_string(unroll_len);

        int device = getActiveDeviceId();
        kc_t::iterator cache_idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (cache_idx == kernelCaches[device].end()) {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D To=" << dtype_traits<To>::getName()
                        << " -D THREADS=" << THREADS
                        << " -D FEAT_LEN=" << unroll_len;

                switch(dist_type) {
                    case AF_SAD: options <<" -D DISTOP=_sad_"; break;
                    case AF_SSD: options <<" -D DISTOP=_ssd_"; break;
                    case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__";
                                 break;
                    default: break;
                }

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                if (use_lmem)
                    options << " -D USE_LOCAL_MEM";

                cl::Program prog;
                buildProgram(prog,
                             nearest_neighbour_cl,
                             nearest_neighbour_cl_len,
                             options.str());

                entry.prog = new Program(prog);
                // Three kernels share one cache entry:
                // [0] unrolled search, [1] generic search, [2] final selection
                entry.ker = new Kernel[3];

                entry.ker[0] = Kernel(*entry.prog, "nearest_neighbour_unroll");
                entry.ker[1] = Kernel(*entry.prog, "nearest_neighbour");
                entry.ker[2] = Kernel(*entry.prog, "select_matches");

                kernelCaches[device][ref_name] = entry;
        } else {
            entry = cache_idx->second;
        }

        const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

        const unsigned nquery = query.info.dims[sample_dim];
        const unsigned ntrain = train.info.dims[sample_dim];

        unsigned nblk = divup(ntrain, THREADS);
        const NDRange local(THREADS, 1);
        const NDRange global(nblk * THREADS, 1);

        // Widen before multiplying: nblk * nquery evaluated in unsigned
        // arithmetic can wrap around for large inputs and under-allocate.
        const size_t blk_elems = static_cast<size_t>(nblk) * nquery;

        cl::Buffer *d_blk_idx  = bufferAlloc(blk_elems * sizeof(unsigned));
        cl::Buffer *d_blk_dist = bufferAlloc(blk_elems * sizeof(To));

        // First pass: fill the nblk * nquery candidate buffers
        // (presumably one best candidate per work-group for each query
        // vector - confirm against the kernel source)
        if (unroll_len > 0) {
            auto huOp = KernelFunctor<Buffer, Buffer,
                                    Buffer, KParam,
                                    Buffer, KParam,
                                    const To,
                                    LocalSpaceArg> (entry.ker[0]);

            huOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, cl::Local(lmem_sz));
        }
        else {
            auto hmOp = KernelFunctor<Buffer, Buffer,
                                    Buffer, KParam,
                                    Buffer, KParam,
                                    const To, const unsigned,
                                    LocalSpaceArg> (entry.ker[1]);

            hmOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, feat_len, cl::Local(lmem_sz));
        }
        CL_DEBUG_FINISH(getQueue());

        const NDRange local_sm(32, 8);
        const NDRange global_sm(divup(nquery, 32) * 32, 8);

        // Second pass: reduce the per-block candidates and store the final
        // best match for each query vector
        auto smOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                const unsigned, const unsigned,
                                const To> (entry.ker[2]);

        smOp(EnqueueArgs(getQueue(), global_sm, local_sm),
             *idx.data, *dist.data,
             *d_blk_idx, *d_blk_dist,
             nquery, nblk, max_dist);
        CL_DEBUG_FINISH(getQueue());

        bufferFree(d_blk_idx);
        bufferFree(d_blk_dist);
    } catch (const cl::Error &err) {
        // Caught by const reference: no copy of the exception object
        CL_TO_AF_ERROR(err);
        throw;
    }
}
// Prepares the signal and filter for FFT convolution by launching two
// OpenCL kernels: "pack_data" on the signal and "pad_array" on the filter.
// Both write into temporaries that calcParamSizes() derives from `packed`.
//
// The program is built from fftconvolve_pack_cl at most once per device
// (std::call_once) and cached in function-local static maps.
// NOTE(review): T, convT and isDouble are not declared in this block;
// presumably they are template parameters - confirm against the enclosing
// declaration.
//
// Parameters:
//   packed  - forwarded to calcParamSizes()
//   sig     - signal input to pack_data
//   filter  - filter input to pad_array
//   baseDim - forwarded to calcParamSizes()
//   kind    - forwarded to calcParamSizes()
//
// A cl::Error raised inside is handed to CL_TO_AF_ERROR.
void packDataHelper(Param packed,
                    Param sig,
                    Param filter,
                    const int baseDim,
                    ConvolveBatchKind kind)
{
    try {
        static std::once_flag     compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  fftconvolveProgs;
        static std::map<int, Kernel*>   pdKernel;
        static std::map<int, Kernel*>   paKernel;

        int device = getActiveDeviceId();

        // Build the program and create both kernels on first use for this
        // device
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();

                if ((af_dtype) dtype_traits<convT>::af_type == c32) {
                    options << " -D CONVT=float";
                }
                else if ((af_dtype) dtype_traits<convT>::af_type == c64 && isDouble) {
                    options << " -D CONVT=double"
                            << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, fftconvolve_pack_cl, fftconvolve_pack_cl_len, options.str());
                fftconvolveProgs[device] = new Program(prog);

                pdKernel[device] = new Kernel(*fftconvolveProgs[device], "pack_data");
                paKernel[device] = new Kernel(*fftconvolveProgs[device], "pad_array");
            });

        Param sig_tmp, filter_tmp;
        calcParamSizes(sig_tmp, filter_tmp, packed, sig, filter, baseDim, kind);

        int sig_packed_elem = sig_tmp.info.strides[3] * sig_tmp.info.dims[3];
        int filter_packed_elem = filter_tmp.info.strides[3] * filter_tmp.info.dims[3];

        // Number of packed complex elements in dimension 0
        int sig_half_d0 = divup(sig.info.dims[0], 2);
        int sig_half_d0_odd = sig.info.dims[0] % 2;

        int blocks = divup(sig_packed_elem, THREADS);

        // Launch configuration for pack_data: one work-item per packed
        // signal element, rounded up to whole work-groups
        NDRange local(THREADS);
        NDRange global(blocks * THREADS);

        // Pack signal in a complex matrix where first dimension is half the input
        // (allows faster FFT computation) and pad array to a power of 2 with 0s
        auto pdOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                const int, const int> (*pdKernel[device]);

        pdOp(EnqueueArgs(getQueue(), global, local),
             *sig_tmp.data, sig_tmp.info, *sig.data, sig.info,
             sig_half_d0, sig_half_d0_odd);
        CL_DEBUG_FINISH(getQueue());

        // Re-size the launch for the filter; the work-group size is reused
        blocks = divup(filter_packed_elem, THREADS);
        global = NDRange(blocks * THREADS);

        // Pad filter array with 0s
        auto paOp = make_kernel<Buffer, KParam,
                                Buffer, KParam> (*paKernel[device]);

        paOp(EnqueueArgs(getQueue(), global, local),
             *filter_tmp.data, filter_tmp.info,
             *filter.data, filter.info);
        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) {
        // Caught by const reference: no copy of the exception object
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Exemple #26
0
// Launches the "morph" OpenCL kernel on `in`, using `mask` as the structuring
// element, and writes the result to `out`. The kernel is compiled once per
// device with T, isDilation, the reduction start value and windLen baked in
// as preprocessor definitions.
void morph(Param         out,
        const Param      in,
        const Param      mask)
{
    try {
        // Per-device build state; the program is compiled the first time a
        // given device id reaches this function.
        static std::once_flag buildOnce[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> programs;
        static std::map<int, Kernel*>  kernels;

        const int device = getActiveDeviceId();

        std::call_once(buildOnce[device], [device] () {
                ToNumStr<T> toNumStr;

                // Start value of the reduction: max-based for dilation,
                // min-based otherwise.
                T init = isDilation ? Binary<T, af_max_t>().init()
                                    : Binary<T, af_min_t>().init();

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                options << " -D isDilation=" << isDilation;
                options << " -D init=" << toNumStr(init);
                options << " -D windLen=" << windLen;

                const bool needsDouble = std::is_same<T, double>::value ||
                                         std::is_same<T, cdouble>::value;
                if (needsDouble) {
                    options << " -D USE_DOUBLE";
                }

                Program prog;
                buildProgram(prog, morph_cl, morph_cl_len, options.str());
                programs[device] = new Program(prog);
                kernels[device]  = new Kernel(*programs[device], "morph");
            });

        // Launch geometry: one THREADS_X x THREADS_Y work-group per tile, with
        // dims[2] folded into the x extent and dims[3] into the y extent.
        const int blk_x = divup(in.info.dims[0], THREADS_X);
        const int blk_y = divup(in.info.dims[1], THREADS_Y);

        NDRange local(THREADS_X, THREADS_Y);
        NDRange global(blk_x * THREADS_X * in.info.dims[2],
                       blk_y * THREADS_Y * in.info.dims[3]);

        // Local-memory tile: the work-group footprint widened by the window
        // apron on both axes, plus one extra column in x.
        const int apron     = 2 * (windLen / 2);
        const int tileWidth = THREADS_X + apron + 1;
        const int tileElems = tileWidth * (THREADS_Y + apron);

        // Copy the windLen x windLen mask into a scratch buffer that is handed
        // to the kernel.
        cl_int maskBytes = sizeof(T)*windLen*windLen;
        cl::Buffer *maskBuf = bufferAlloc(maskBytes);
        getQueue().enqueueCopyBuffer(*mask.data, *maskBuf, 0, 0, maskBytes);

        auto morphOp = KernelFunctor<Buffer, KParam,
                                   Buffer, KParam,
                                   Buffer, cl::LocalSpaceArg,
                                   int, int
                                  >(*kernels[device]);

        morphOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                *maskBuf,
                cl::Local(tileElems*sizeof(T)),
                blk_x, blk_y);

        bufferFree(maskBuf);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Exemple #27
0
void orb(unsigned* out_feat,
         Param& x_out,
         Param& y_out,
         Param& score_out,
         Param& ori_out,
         Param& size_out,
         Param& desc_out,
         Param image,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels,
         const bool blur_img)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program            orbProgs[DeviceManager::MAX_DEVICES];
        static Kernel             hrKernel[DeviceManager::MAX_DEVICES];
        static Kernel             kfKernel[DeviceManager::MAX_DEVICES];
        static Kernel             caKernel[DeviceManager::MAX_DEVICES];
        static Kernel             eoKernel[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D BLOCK_SIZE=" << ORB_THREADS_X;

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                buildProgram(orbProgs[device],
                             orb_cl,
                             orb_cl_len,
                             options.str());

                hrKernel[device] = Kernel(orbProgs[device], "harris_response");
                kfKernel[device] = Kernel(orbProgs[device], "keep_features");
                caKernel[device] = Kernel(orbProgs[device], "centroid_angle");
                eoKernel[device] = Kernel(orbProgs[device], "extract_orb");
            });

        unsigned patch_size = REF_PAT_SIZE;

        unsigned min_side = std::min(image.info.dims[0], image.info.dims[1]);
        unsigned max_levels = 0;
        float scl_sum = 0.f;
        for (unsigned i = 0; i < levels; i++) {
            min_side /= scl_fctr;

            // Minimum image side for a descriptor to be computed
            if (min_side < patch_size || max_levels == levels) break;

            max_levels++;
            scl_sum += 1.f / (float)pow(scl_fctr,(float)i);
        }

        std::vector<cl::Buffer*> d_x_pyr(max_levels);
        std::vector<cl::Buffer*> d_y_pyr(max_levels);
        std::vector<cl::Buffer*> d_score_pyr(max_levels);
        std::vector<cl::Buffer*> d_ori_pyr(max_levels);
        std::vector<cl::Buffer*> d_size_pyr(max_levels);
        std::vector<cl::Buffer*> d_desc_pyr(max_levels);

        std::vector<unsigned> feat_pyr(max_levels);
        unsigned total_feat = 0;

        // Compute number of features to keep for each level
        std::vector<unsigned> lvl_best(max_levels);
        unsigned feat_sum = 0;
        for (unsigned i = 0; i < max_levels-1; i++) {
            float lvl_scl = (float)pow(scl_fctr,(float)i);
            lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl);
            feat_sum += lvl_best[i];
        }
        lvl_best[max_levels-1] = max_feat - feat_sum;

        // Maintain a reference to previous level image
        Param prev_img;
        Param lvl_img;

        const unsigned gauss_len = 9;
        T* h_gauss = nullptr;
        Param gauss_filter;
        gauss_filter.data = nullptr;

        for (unsigned i = 0; i < max_levels; i++) {
            const float lvl_scl = (float)pow(scl_fctr,(float)i);

            if (i == 0) {
                // First level is used in its original size
                lvl_img = image;

                prev_img = image;
            }
            else if (i > 0) {
                // Resize previous level image to current level dimensions
                lvl_img.info.dims[0] = round(image.info.dims[0] / lvl_scl);
                lvl_img.info.dims[1] = round(image.info.dims[1] / lvl_scl);

                lvl_img.info.strides[0] = 1;
                lvl_img.info.strides[1] = lvl_img.info.dims[0];

                for (int k = 2; k < 4; k++) {
                    lvl_img.info.dims[k] = 1;
                    lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] * lvl_img.info.strides[k - 1];
                }

                lvl_img.info.offset = 0;
                lvl_img.data = bufferAlloc(lvl_img.info.dims[3] * lvl_img.info.strides[3] * sizeof(T));

                resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img);

                if (i > 1)
                   bufferFree(prev_img.data);
                prev_img = lvl_img;
            }

            unsigned lvl_feat = 0;
            Param d_x_feat, d_y_feat, d_score_feat;

            // Round feature size to nearest odd integer
            float size = 2.f * floor(patch_size / 2.f) + 1.f;

            // Avoid keeping features that might be too wide and might not fit on
            // the image, sqrt(2.f) is the radius when angle is 45 degrees and
            // represents widest case possible
            unsigned edge = ceil(size * sqrt(2.f) / 2.f);

            // Detect FAST features
            fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat,
                             lvl_img, fast_thr, 0.15f, edge);

            if (lvl_feat == 0) {
                feat_pyr[i] = 0;

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }

            bufferFree(d_score_feat.data);

            unsigned usable_feat = 0;
            cl::Buffer* d_usable_feat = bufferAlloc(sizeof(unsigned));
            getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            cl::Buffer* d_x_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_y_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float));

            // Calculate Harris responses
            // Good block_size >= 7 (must be an odd number)
            const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X);
            const NDRange local(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            unsigned block_size = 7;
            float k_thr = 0.04f;

            auto hrOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, const unsigned,
                                    Buffer, Buffer, KParam,
                                    const unsigned, const float, const unsigned> (hrKernel[device]);

            hrOp(EnqueueArgs(getQueue(), global, local),
                 *d_x_harris, *d_y_harris, *d_score_harris,
                 *d_x_feat.data, *d_y_feat.data, lvl_feat,
                 *d_usable_feat, *lvl_img.data, lvl_img.info,
                 block_size, k_thr, patch_size);
            CL_DEBUG_FINISH(getQueue());

            getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            bufferFree(d_x_feat.data);
            bufferFree(d_y_feat.data);
            bufferFree(d_usable_feat);

            if (usable_feat == 0) {
                feat_pyr[i] = 0;

                bufferFree(d_x_harris);
                bufferFree(d_y_harris);
                bufferFree(d_score_harris);

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }

            // Sort features according to Harris responses
            Param d_harris_sorted;
            Param d_harris_idx;

            d_harris_sorted.info.dims[0] = usable_feat;
            d_harris_idx.info.dims[0] = usable_feat;
            d_harris_sorted.info.strides[0] = 1;
            d_harris_idx.info.strides[0] = 1;

            for (int k = 1; k < 4; k++) {
                d_harris_sorted.info.dims[k] = 1;
                d_harris_idx.info.dims[k] = 1;
                d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] * d_harris_sorted.info.strides[k - 1];
                d_harris_idx.info.strides[k] = d_harris_idx.info.dims[k - 1] * d_harris_idx.info.strides[k - 1];
            }

            d_harris_sorted.info.offset = 0;
            d_harris_idx.info.offset = 0;
            d_harris_sorted.data = d_score_harris;
            d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));

            sort0_index<float, false>(d_harris_sorted, d_harris_idx);

            cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float));

            usable_feat = min(usable_feat, lvl_best[i]);

            // Keep only features with higher Harris responses
            const dim_type keep_blk = divup(usable_feat, ORB_THREADS);
            const NDRange local_keep(ORB_THREADS, 1);
            const NDRange global_keep(keep_blk * ORB_THREADS, 1);

            auto kfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer, Buffer,
                                    const unsigned> (kfKernel[device]);

            kfOp(EnqueueArgs(getQueue(), global_keep, local_keep),
                 *d_x_lvl, *d_y_lvl, *d_score_lvl,
                 *d_x_harris, *d_y_harris, *d_harris_sorted.data, *d_harris_idx.data,
                 usable_feat);
            CL_DEBUG_FINISH(getQueue());

            bufferFree(d_x_harris);
            bufferFree(d_y_harris);
            bufferFree(d_harris_sorted.data);
            bufferFree(d_harris_idx.data);

            cl::Buffer* d_ori_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float));

            // Compute orientation of features
            const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X);
            const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            auto caOp = make_kernel<Buffer, Buffer, Buffer,
                                    const unsigned, Buffer, KParam,
                                    const unsigned> (caKernel[device]);

            caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                 *d_x_lvl, *d_y_lvl, *d_ori_lvl,
                 usable_feat, *lvl_img.data, lvl_img.info,
                 patch_size);
            CL_DEBUG_FINISH(getQueue());

            Param lvl_filt;
            Param lvl_tmp;

            if (blur_img) {
                lvl_filt = lvl_img;
                lvl_tmp = lvl_img;

                lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * lvl_filt.info.dims[1] * sizeof(T));
                lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * lvl_tmp.info.dims[1] * sizeof(T));

                // Calculate a separable Gaussian kernel
                if (h_gauss == nullptr) {
                    h_gauss = new T[gauss_len];
                    gaussian1D(h_gauss, gauss_len, 2.f);
                    gauss_filter.info.dims[0] = gauss_len;
                    gauss_filter.info.strides[0] = 1;

                    for (int k = 1; k < 4; k++) {
                        gauss_filter.info.dims[k] = 1;
                        gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] * gauss_filter.info.strides[k - 1];
                    }

                    dim_type gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
                    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
                    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
                }

                // Filter level image with Gaussian kernel to reduce noise sensitivity
                convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter);
                convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter);

                bufferFree(lvl_tmp.data);
            }

            // Compute ORB descriptors
            cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned));
            unsigned* h_desc_lvl = new unsigned[usable_feat * 8];
            for (int j = 0; j < (int)usable_feat * 8; j++)
                h_desc_lvl[j] = 0;
            getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0, usable_feat * 8 * sizeof(unsigned), h_desc_lvl);
            delete[] h_desc_lvl;

            auto eoOp = make_kernel<Buffer, const unsigned,
                                    Buffer, Buffer, Buffer, Buffer,
                                    Buffer, KParam,
                                    const float, const unsigned> (eoKernel[device]);

            if (blur_img) {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_filt.data, lvl_filt.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());

                bufferFree(lvl_filt.data);
            }
            else {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_img.data, lvl_img.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());
            }

            // Store results to pyramids
            total_feat += usable_feat;
            feat_pyr[i] = usable_feat;
            d_x_pyr[i] = d_x_lvl;
            d_y_pyr[i] = d_y_lvl;
            d_score_pyr[i] = d_score_lvl;
            d_ori_pyr[i] = d_ori_lvl;
            d_size_pyr[i] = d_size_lvl;
            d_desc_pyr[i] = d_desc_lvl;

            if (i > 0 && i == max_levels-1)
                bufferFree(lvl_img.data);
        }

        if (gauss_filter.data != nullptr)
            bufferFree(gauss_filter.data);
        if (h_gauss != nullptr)
            delete[] h_gauss;

        // If no features are found, set found features to 0 and return
        if (total_feat == 0) {
            *out_feat = 0;
            return;
        }

        // Allocate output memory
        x_out.info.dims[0] = total_feat;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total_feat;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total_feat;
        score_out.info.strides[0] = 1;
        ori_out.info.dims[0] = total_feat;
        ori_out.info.strides[0] = 1;
        size_out.info.dims[0] = total_feat;
        size_out.info.strides[0] = 1;

        desc_out.info.dims[0] = 8;
        desc_out.info.strides[0] = 1;
        desc_out.info.dims[1] = total_feat;
        desc_out.info.strides[1] = desc_out.info.dims[0];

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
            ori_out.info.dims[k] = 1;
            ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
            size_out.info.dims[k] = 1;
            size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
            if (k > 1) {
                desc_out.info.dims[k] = 1;
                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
            }
        }

        if (total_feat > 0) {
            size_t out_sz  = total_feat * sizeof(float);
            x_out.data     = bufferAlloc(out_sz);
            y_out.data     = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);
            ori_out.data   = bufferAlloc(out_sz);
            size_out.data  = bufferAlloc(out_sz);

            size_t desc_sz = total_feat * 8 * sizeof(unsigned);
            desc_out.data  = bufferAlloc(desc_sz);
        }

        unsigned offset = 0;
        for (unsigned i = 0; i < max_levels; i++) {
            if (feat_pyr[i] == 0)
                continue;

            if (i > 0)
                offset += feat_pyr[i-1];

            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));

            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned));

            bufferFree(d_x_pyr[i]);
            bufferFree(d_y_pyr[i]);
            bufferFree(d_score_pyr[i]);
            bufferFree(d_ori_pyr[i]);
            bufferFree(d_size_pyr[i]);
            bufferFree(d_desc_pyr[i]);
        }

        // Sets number of output features
        *out_feat = total_feat;
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Exemple #28
0
void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, const Param filter)
{
    try {
        int f0 = filter.info.dims[0];
        int f1 = filter.info.dims[1];

        std::string ref_name =
            std::string("conv2_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::string(dtype_traits<aT>::getName()) +
            std::string("_") +
            std::to_string(expand) +
            std::string("_") +
            std::to_string(f0) +
            std::string("_") +
            std::to_string(f1);

        int device = getActiveDeviceId();
        kc_t::iterator idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (idx == kernelCaches[device].end()) {
            size_t LOC_SIZE = (THREADS_X+2*(f0-1))*(THREADS_Y+2*(f1-1));

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType="<< dtype_traits<aT>::getName()
                    << " -D BASE_DIM="<< 2 /* hard constant specific to this convolution type */
                    << " -D FLEN0=" << f0
                    << " -D FLEN1=" << f1
                    << " -D EXPAND="<< expand
                    << " -D C_SIZE="<< LOC_SIZE;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            Program prog;
            buildProgram(prog, convolve_cl, convolve_cl_len, options.str());
            entry.prog   = new Program(prog);
            entry.ker = new Kernel(*entry.prog, "convolve");

            kernelCaches[device][ref_name] = entry;
        } else {
            entry = idx->second;
        }

        auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                  Buffer, KParam, int, int,
                                  int, int,
                                  int, int
                                 >(*entry.ker);

        convOp(EnqueueArgs(getQueue(), param.global, param.local),
                *out.data, out.info, *signal.data, signal.info,
                *param.impulse, filter.info, param.nBBS0, param.nBBS1,
                param.o[1], param.o[2], param.s[1], param.s[2]);

    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}