Exemplo n.º 1
0
    // Returns one of the two cached scan kernels ("scan_dim_kernel" for
    // kerIdx == 0, "bcast_dim_kernel" for kerIdx == 1), building and caching
    // the OpenCL program on first use for the active device.
    //
    // kerIdx      which kernel of the pair to return (0 or 1)
    // dim         dimension being scanned (baked into the program)
    // isFinalPass whether this launch writes final results
    // threads_y   work-group height (baked in as DIMY)
    //
    // NOTE(review): Ti, To, op and inclusive_scan are template parameters of
    // the enclosing scope (not visible in this chunk).
    static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
    {
        // Cache key: every parameter that alters the generated program text
        // must appear here, otherwise a stale build could be reused.
        std::string ref_name =
            std::string("scan_") +
            std::to_string(dim) +
            std::string("_") +
            std::to_string(isFinalPass) +
            std::string("_") +
            std::string(dtype_traits<Ti>::getName()) +
            std::string("_") +
            std::string(dtype_traits<To>::getName()) +
            std::string("_") +
            std::to_string(op) +
            std::string("_") +
            std::to_string(threads_y) +
            std::string("_") +
            std::to_string(int(inclusive_scan));

        int device = getActiveDeviceId();

        kc_entry_t entry = kernelCache(device, ref_name);

        if (entry.prog==0 && entry.ker==0) {
            Binary<To, op> scan;
            ToNumStr<To> toNumStr;

            std::ostringstream options;
            options << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D T=To"
                    << " -D dim=" << dim
                    << " -D DIMY=" << threads_y
                    << " -D THREADS_X=" << THREADS_X
                    << " -D init=" << toNumStr(scan.init())
                    << " -D " << binOpName<op>()
                    << " -D CPLX=" << af::iscplx<Ti>()
                    // Fixed: this line was garbled ("******"); isFinalPass
                    // must be streamed into the options string.
                    << " -D isFinalPass=" << isFinalPass
                    << " -D inclusive_scan=" << inclusive_scan;
            // Double-precision support must be requested explicitly in OpenCL.
            if (std::is_same<Ti, double>::value ||
                std::is_same<Ti, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            const char *ker_strs[] = {ops_cl, scan_dim_cl};
            const int   ker_lens[] = {ops_cl_len, scan_dim_cl_len};
            cl::Program prog;
            buildProgram(prog, 2, ker_strs, ker_lens, options.str());

            entry.prog = new Program(prog);
            // Both kernels of this program are cached under one entry.
            entry.ker = new Kernel[2];

            entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
            entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");

            addKernelToCache(device, ref_name, entry);
        }

        return entry.ker[kerIdx];
    }
        // Element-wise arithmetic between a CSR sparse matrix (values/rowIdx/
        // colIdx) and a dense operand `rhs`, writing into the dense `out`.
        // `reverse` is forwarded to the kernel; presumably it swaps the
        // operand order for non-commutative ops — TODO confirm in the .cl
        // source. Relies on template parameters T and op from the enclosing
        // scope (not visible in this chunk).
        void sparseArithOpCSR(Param out, const Param values, const Param rowIdx, const Param colIdx,
                const Param rhs, const bool reverse)
        {
            // Kernel cache key: one program per (operation, value type) pair.
            std::string ref_name =
                std::string("sparseArithOpCSR_") +
                getOpString<op>() + std::string("_") +
                std::string(dtype_traits<T>::getName());

            int device = getActiveDeviceId();
            kc_entry_t entry = kernelCache(device, ref_name);

            // Build the program on first use for this device, then cache it.
            if (entry.prog==0 && entry.ker==0) {

                std::ostringstream options;
                options << " -D T="  << dtype_traits<T>::getName();
                options << " -D OP=" << getOpString<op>();

                // Complex element types take a different code path in the kernel.
                if((af_dtype) dtype_traits<T>::af_type == c32 ||
                        (af_dtype) dtype_traits<T>::af_type == c64) {
                    options << " -D IS_CPLX=1";
                } else {
                    options << " -D IS_CPLX=0";
                }
                // Double precision must be enabled explicitly in OpenCL.
                if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                const char *ker_strs[] = {sparse_arith_common_cl    , sparse_arith_csr_cl};
                const int   ker_lens[] = {sparse_arith_common_cl_len, sparse_arith_csr_cl_len};

                Program prog;
                buildProgram(prog, 2, ker_strs, ker_lens, options.str());
                entry.prog = new Program(prog);
                entry.ker  = new Kernel(*entry.prog, "sparse_arith_csr_kernel");

                addKernelToCache(device, ref_name, entry);
            }

            auto sparseArithCSROp = KernelFunctor<Buffer, const KParam,
                 const Buffer, const Buffer, const Buffer,
                 const int,
                 const Buffer, const KParam,
                 const int>(*entry.ker);

            // Launch geometry: TX*TY work-group; global x covers dims[0] in
            // TY-sized chunks.
            NDRange local(TX, TY, 1);
            NDRange global(divup(out.info.dims[0], TY) * TX, TY, 1);

            // Argument order must match the kernel signature exactly:
            // out, out info, CSR triplet, nnz count, rhs, rhs info, reverse.
            sparseArithCSROp(EnqueueArgs(getQueue(), global, local),
                    *out.data, out.info,
                    *values.data, *rowIdx.data, *colIdx.data, values.info.dims[0],
                    *rhs.data, rhs.info, reverse);

            CL_DEBUG_FINISH(getQueue());
        }
Exemplo n.º 3
0
// 3D morphological operation (dilation or erosion, selected by the
// enclosing-scope template parameter isDilation) of `in` into `out`, using
// `mask` as the SeLength^3 structuring element. T and SeLength are also
// template parameters of the enclosing scope (not visible in this chunk).
void morph3d(Param       out,
        const Param      in,
        const Param      mask)
{
    // Cache key: one program per (type, operation, element size) combination.
    std::string refName = std::string("morph3d_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(isDilation) + std::to_string(SeLength);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    // Build and cache the program on first use for this device.
    if (entry.prog==0 && entry.ker==0) {
        std::string options = generateOptionsString<T, isDilation, SeLength>();
        const char* ker_strs[] = {morph_cl};
        const int   ker_lens[] = {morph_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "morph3d");
        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer,
                                  cl::LocalSpaceArg, int >(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);
    // launch batch * blk_x blocks along x dimension
    NDRange global(blk_x * CUBE_X * in.info.dims[3], blk_y * CUBE_Y, blk_z * CUBE_Z);

    // copy mask/filter to constant memory
    cl_int se_size   = sizeof(T)*SeLength*SeLength*SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

    // calculate shared memory size
    // padding widens the tile by the structuring-element halo; the extra +1
    // on locLen is presumably to avoid shared-memory bank conflicts — TODO
    // confirm against the .cl source.
    const int padding = (SeLength%2==0 ? (SeLength-1) : (2*(SeLength/2)));
    const int locLen  = CUBE_X+padding+1;
    const int locArea = locLen *(CUBE_Y+padding);
    const int locSize = locArea*(CUBE_Z+padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize*sizeof(T)), blk_x);

    // Temporary structuring-element buffer is only needed for the launch.
    bufferFree(mBuff);
    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 4
0
// Non-maximal suppression over the response map `resp_in`; surviving corner
// coordinates and responses are appended to x_out / y_out / resp_out.
// Returns the number of corners the kernel recorded. T is a template
// parameter of the enclosing scope.
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
                    const unsigned idim0, const unsigned idim1,
                    const cl::Buffer* resp_in, const unsigned edge,
                    const unsigned max_corners) {
    // One cached program per element type.
    const std::string cacheKey =
        std::string("non_maximal_") + std::string(dtype_traits<T>::getName());

    const int devId   = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog == 0 && cached.ker == 0) {
        // First use on this device: compile and cache the program.
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX";
        const bool needsFp64 =
            std::is_same<T, double>::value || std::is_same<T, cdouble>::value;
        if (needsFp64) defs << " -D USE_DOUBLE";

        const char* srcs[] = {susan_cl};
        const int   lens[] = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "non_maximal");

        addKernelToCache(devId, cacheKey, cached);
    }

    // Device-side corner counter, zero-initialized before the launch.
    unsigned hostCount    = 0;
    cl::Buffer* countBuf  = bufferAlloc(sizeof(unsigned));
    getQueue().enqueueWriteBuffer(*countBuf, CL_TRUE, 0, sizeof(unsigned),
                                  &hostCount);

    auto launchNonMaximal =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, unsigned, unsigned,
                      Buffer, unsigned, unsigned>(*cached.ker);

    // The `edge` border on every side is excluded from the search area.
    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    launchNonMaximal(EnqueueArgs(getQueue(), global, local), *x_out, *y_out,
                     *resp_out, *countBuf, idim0, idim1, *resp_in, edge,
                     max_corners);

    // Blocking read: how many corners were actually stored.
    getQueue().enqueueReadBuffer(*countBuf, CL_TRUE, 0, sizeof(unsigned),
                                 &hostCount);
    bufferFree(countBuf);

    return hostCount;
}
Exemplo n.º 5
0
// First/second finite difference of `in` into `out`. The dimension and
// order (dim, isDiff2) as well as T are template parameters of the
// enclosing scope; `indims` is the runtime rank of the input.
void diff(Param out, const Param in, const unsigned indims)
{
    // One cached program per (type, dim, order) combination.
    std::string cacheKey = std::string("diff_kernel_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(dim) +
        std::to_string(isDiff2);

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        std::ostringstream defs;
        defs << " -D T="        << dtype_traits<T>::getName()
             << " -D DIM="      << dim
             << " -D isDiff2=" << isDiff2;
        // OpenCL requires an explicit opt-in for double precision.
        if (std::is_same<T, double>::value ||
            std::is_same<T, cdouble>::value) {
            defs << " -D USE_DOUBLE";
        }

        const char* srcs[] = {diff_cl};
        const int   lens[] = {diff_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "diff_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    auto launchDiff = KernelFunctor< Buffer, const Buffer, const KParam, const KParam,
                                     const int, const int, const int> (*cached.ker);

    // Vectors differentiated along dim 0 get a flat 1D work-group.
    NDRange local = (dim == 0 && indims == 1) ? NDRange(TX * TY, 1, 1)
                                              : NDRange(TX, TY, 1);

    int blkX = divup(out.info.dims[0], local[0]);
    int blkY = divup(out.info.dims[1], local[1]);
    NDRange global(local[0] * blkX * out.info.dims[2],
                   local[1] * blkY * out.info.dims[3], 1);

    // Total output element count, passed for bounds checks in the kernel.
    const int outElems = out.info.dims[0] * out.info.dims[1] * out.info.dims[2] * out.info.dims[3];

    launchDiff(EnqueueArgs(getQueue(), global, local),
               *out.data, *in.data, out.info, in.info, outElems, blkX, blkY);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 6
0
// 2D median filter of `in` into `out`. Window geometry (w_len x w_wid),
// padding mode `pad`, and element type T come from the enclosing template
// scope (not visible in this chunk).
void medfilt2(Param out, const Param in) {
    // Cache key covers type, padding mode and window geometry.
    const std::string cacheKey =
        std::string("medfilt2_") + std::string(dtype_traits<T>::getName()) +
        std::to_string(pad) + std::to_string(w_len) + std::to_string(w_wid);

    const int devId   = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog == 0 && cached.ker == 0) {
        // Per-thread scratch size baked into the kernel — presumably the
        // working set of the median selection; confirm against medfilt2.cl.
        const int ARR_SIZE = w_len * (w_wid - w_wid / 2);

        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName() << " -D pad=" << pad
             << " -D AF_PAD_ZERO=" << AF_PAD_ZERO
             << " -D AF_PAD_SYM=" << AF_PAD_SYM
             << " -D ARR_SIZE=" << ARR_SIZE << " -D w_len=" << w_len
             << " -D w_wid=" << w_wid;
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            defs << " -D USE_DOUBLE";

        const char* srcs[] = {medfilt2_cl};
        const int   lens[] = {medfilt2_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "medfilt2");

        addKernelToCache(devId, cacheKey, cached);
    }

    NDRange local(THREADS_X, THREADS_Y);

    const int blkX = divup(in.info.dims[0], THREADS_X);
    const int blkY = divup(in.info.dims[1], THREADS_Y);

    // Batch dimensions (2 and 3) are folded into the launch grid.
    NDRange global(blkX * in.info.dims[2] * THREADS_X,
                   blkY * in.info.dims[3] * THREADS_Y);

    auto launchMedfilt = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                       cl::LocalSpaceArg, int, int>(*cached.ker);

    // Local tile: thread block plus a (w_len-1, w_wid-1) halo.
    const size_t tileBytes =
        (THREADS_X + w_len - 1) * (THREADS_Y + w_wid - 1) * sizeof(T);

    launchMedfilt(EnqueueArgs(getQueue(), global, local), *out.data, out.info,
                  *in.data, in.info, cl::Local(tileBytes), blkX, blkY);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 7
0
// Copies `in` into `out` at the given 4D `offset` — one step of a join
// along template-parameter `dim`. Ti/To are the input/output element types
// from the enclosing template scope.
void join(Param out, const Param in, const af::dim4 offset)
{
    // One cached program per (To, Ti, dim) combination.
    std::string cacheKey = std::string("join_kernel_") +
        std::string(dtype_traits<To>::getName()) +
        std::string(dtype_traits<Ti>::getName()) +
        std::to_string(dim);

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        std::ostringstream defs;
        defs << " -D To=" << dtype_traits<To>::getName()
             << " -D Ti=" << dtype_traits<Ti>::getName()
             << " -D dim=" << dim;

        // Either side being double-precision requires the fp64 opt-in.
        const bool needsFp64 =
            std::is_same<To, double>::value || std::is_same<To, cdouble>::value ||
            std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value;
        if (needsFp64) {
            defs << " -D USE_DOUBLE";
        }

        const char* srcs[] = {join_cl};
        const int   lens[] = {join_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "join_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    auto launchJoin = KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                                    const int, const int, const int, const int,
                                    const int, const int> (*cached.ker);

    NDRange local(TX, TY, 1);

    // Tile the first two input dims; batch dims ride along in the grid.
    int blkX = divup(in.info.dims[0], TILEX);
    int blkY = divup(in.info.dims[1], TILEY);
    NDRange global(local[0] * blkX * in.info.dims[2],
                   local[1] * blkY * in.info.dims[3], 1);

    launchJoin(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info,
               offset[0], offset[1], offset[2], offset[3], blkX, blkY);

    CL_DEBUG_FINISH(getQueue());
}
// Launches one step of an N-dimensional convolution using the launch
// geometry and the pre-staged filter (param.impulse) carried in `param`.
// T (signal type), aT (accumulator type), bDim (base dimensionality) and
// expand are template parameters of the enclosing scope (not visible here).
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter)
{
    // Cache key: one program per (type pair, base dim, expand flag).
    std::string ref_name = std::string("convolveND_") +
        std::string(dtype_traits<T>::getName()) + std::string(dtype_traits<aT>::getName()) +
        std::to_string(bDim) + std::to_string(expand);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    // Build and cache the program on first use for this device.
    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T="         << dtype_traits<T>::getName()
                << " -D Ti="        << dtype_traits<T>::getName()
                << " -D To="        << dtype_traits<aT>::getName()
                << " -D accType="   << dtype_traits<aT>::getName()
                << " -D BASE_DIM="  << bDim
                << " -D EXPAND="    << expand
                << " -D "           << binOpName<af_mul_t>();

        // Complex signal types take a different multiply path in the kernel.
        if((af_dtype) dtype_traits<T>::af_type == c32 ||
            (af_dtype) dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
        } else {
            options << " -D CPLX=0";
        }
        // Double precision must be enabled explicitly in OpenCL.
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char *ker_strs[] = {ops_cl, convolve_cl};
        const int   ker_lens[] = {ops_cl_len, convolve_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());

        entry.prog   = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "convolve");

        addKernelToCache(device, ref_name, entry);
    }

    auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam, cl::LocalSpaceArg, Buffer, KParam,
                                    int, int, int, int, int, int, int, int >(*entry.ker);

    // Launch geometry, local-memory size, filter buffer, batching counts
    // (nBBS0/nBBS1) and per-dim offsets/steps (o[], s[]) all come from
    // `param`, which the caller prepared.
    convOp(EnqueueArgs(getQueue(), param.global, param.local),
           *out.data, out.info, *signal.data, signal.info, cl::Local(param.loc_size),
           *param.impulse, filter.info, param.nBBS0, param.nBBS1,
           param.o[0], param.o[1], param.o[2], param.s[0], param.s[1], param.s[2]);
}
Exemplo n.º 9
0
// Final pass of `where`: scatters the indices of non-zero elements of `in`
// into `out_data`, using the scan results in otmp/rtmp. T is a template
// parameter of the enclosing scope.
static void get_out_idx(Buffer *out_data,
                        Param &otmp, Param &rtmp,
                        Param &in, uint threads_x,
                        uint groups_x, uint groups_y)
{
    // One cached program per element type.
    std::string cacheKey = std::string("get_out_idx_kernel_") + std::string(dtype_traits<T>::getName());

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        ToNumStr<T> numStr;
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName()
             << " -D zero=" << numStr(scalar<T>(0))
             << " -D CPLX=" << af::iscplx<T>();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            defs << " -D USE_DOUBLE";

        const char* srcs[] = {where_cl};
        const int   lens[] = {where_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "get_out_idx_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    // Work-group shape is threads_x wide, padded to THREADS_PER_GROUP total.
    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(local[0] * groups_x * in.info.dims[2], local[1] * groups_y * in.info.dims[3]);

    // Elements each thread processes along dim 0.
    uint lim = divup(otmp.info.dims[0], (threads_x * groups_x));

    auto launchWhere = KernelFunctor< Buffer, Buffer, KParam, Buffer, KParam,
                                      Buffer, KParam, uint, uint, uint>(*cached.ker);

    launchWhere(EnqueueArgs(getQueue(), global, local),
                *out_data, *otmp.data, otmp.info,
                *rtmp.data, rtmp.info, *in.data, in.info,
                groups_x, groups_y, lim);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 10
0
void laset(int m, int  n,
           T offdiag, T diag,
           cl_mem dA, size_t dA_offset, magma_int_t ldda)
{
    std::string refName = laset_name<uplo>() + std::string("_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(uplo);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D BLK_X=" << BLK_X
                << " -D BLK_Y=" << BLK_Y
                << " -D IS_CPLX=" << af::iscplx<T>();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {laset_cl};
        const int   ker_lens[] = {laset_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, laset_name<uplo>());

        addKernelToCache(device, refName, entry);
    }

    int groups_x = (m - 1) / BLK_X + 1;
    int groups_y = (n - 1) / BLK_Y + 1;

    NDRange local(BLK_X, 1);
    NDRange global(groups_x * local[0], groups_y * local[1]);

    // retain the cl_mem object during cl::Buffer creation
    cl::Buffer dAObj(dA, true);

    auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker);

    lasetOp(EnqueueArgs(getQueue(), global, local), m, n, offdiag, diag, dAObj, dA_offset, ldda);
}
Exemplo n.º 11
0
// Copies the upper/lower triangle of `in` into `out`, zeroing the rest and
// optionally forcing a unit diagonal. is_upper, is_unit_diag and T are
// template parameters of the enclosing scope.
void triangle(Param out, const Param in)
{
    // One cached program per (type, triangle, unit-diagonal) combination.
    std::string cacheKey = std::string("triangle_kernel_") + std::string(dtype_traits<T>::getName()) +
        std::to_string(is_upper) + std::to_string(is_unit_diag);

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName()
             << " -D is_upper=" << is_upper
             << " -D is_unit_diag=" << is_unit_diag
             << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")"
             << " -D ONE=(T)(" << scalar_to_option(scalar<T>(1)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            defs << " -D USE_DOUBLE";

        const char* srcs[] = {triangle_cl};
        const int   lens[] = {triangle_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "triangle_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    NDRange local(TX, TY);

    // Tile the matrix; batch dims (2, 3) ride along in the global range.
    int grpX = divup(out.info.dims[0], TILEX);
    int grpY = divup(out.info.dims[1], TILEY);

    NDRange global(grpX * out.info.dims[2] * local[0], grpY * out.info.dims[3] * local[1]);

    auto launchTriangle = KernelFunctor< Buffer, KParam, const Buffer, KParam,
                                         const int, const int >(*cached.ker);

    launchTriangle(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *in.data, in.info, grpX, grpY);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 12
0
// Fills `out` with an iota sequence shaped by `sdims` and tiled by `tdims`.
// T is a template parameter of the enclosing scope.
void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims)
{
    // One cached program per element type.
    std::string cacheKey = std::string("iota_kernel_") + std::string(dtype_traits<T>::getName());

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        std::ostringstream defs;

        defs << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            defs << " -D USE_DOUBLE";

        const char* srcs[] = {iota_cl};
        const int   lens[] = {iota_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "iota_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    auto launchIota = KernelFunctor<Buffer, const KParam,
                                    const int, const int, const int, const int,
                                    const int, const int, const int, const int,
                                    const int, const int> (*cached.ker);

    NDRange local(IOTA_TX, IOTA_TY, 1);

    // Tile the first two output dims; batch dims ride along in the grid.
    int blkX = divup(out.info.dims[0], TILEX);
    int blkY = divup(out.info.dims[1], TILEY);
    NDRange global(local[0] * blkX * out.info.dims[2],
                   local[1] * blkY * out.info.dims[3], 1);

    launchIota(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, sdims[0], sdims[1], sdims[2], sdims[3],
               tdims[0], tdims[1], tdims[2], tdims[3], blkX, blkY);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 13
0
// Computes SUSAN corner responses for the idim0 x idim1 image `in` (starting
// at `in_off`) into `out`, excluding an `edge`-wide border. `radius` and T
// are template parameters of the enclosing scope; t and g are the SUSAN
// brightness and geometric thresholds forwarded to the kernel.
void susan(cl::Buffer* out, const cl::Buffer* in, const unsigned in_off,
           const unsigned idim0, const unsigned idim1, const float t,
           const float g, const unsigned edge) {
    // One cached program per (type, radius) combination.
    std::string cacheKey = std::string("susan_responses_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::to_string(radius);

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog == 0 && cached.ker == 0) {
        // Local tile: thread block plus a radius-wide halo on each side.
        const size_t LOCAL_MEM_SIZE =
            (SUSAN_THREADS_X + 2 * radius) * (SUSAN_THREADS_Y + 2 * radius);
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName()
             << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE
             << " -D BLOCK_X=" << SUSAN_THREADS_X
             << " -D BLOCK_Y=" << SUSAN_THREADS_Y << " -D RADIUS=" << radius
             << " -D RESPONSE";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            defs << " -D USE_DOUBLE";

        const char* srcs[] = {susan_cl};
        const int   lens[] = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "susan_responses");

        addKernelToCache(devId, cacheKey, cached);
    }

    auto launchSusan = KernelFunctor<Buffer, Buffer, unsigned, unsigned, unsigned,
                                     float, float, unsigned>(*cached.ker);

    // Border of width `edge` on every side is excluded from the launch.
    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    launchSusan(EnqueueArgs(getQueue(), global, local), *out, *in, in_off, idim0,
                idim1, t, g, edge);
}
Exemplo n.º 14
0
// Converts between HSV and RGB color spaces; the direction is fixed by the
// enclosing-scope template parameter isHSV2RGB, the element type by T.
void hsv2rgb_convert(Param out, const Param in)
{
    // One cached program per (type, direction) combination.
    std::string cacheKey = std::string("hsvrgb_convert_") +
        std::string(dtype_traits<T>::getName()) + std::to_string(isHSV2RGB);

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog==0 && cached.ker==0) {
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName();

        if(isHSV2RGB) {
            defs << " -D isHSV2RGB";
        }
        // Color types here are real-valued, so only plain double matters.
        if (std::is_same<T, double>::value) {
            defs << " -D USE_DOUBLE";
        }

        const char* srcs[] = {hsv_rgb_cl};
        const int   lens[] = {hsv_rgb_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "convert");

        addKernelToCache(devId, cacheKey, cached);
    }

    NDRange local(THREADS_X, THREADS_Y);

    int blkX = divup(in.info.dims[0], THREADS_X);
    int blkY = divup(in.info.dims[1], THREADS_Y);

    // all images are three channels, so batch
    // parameter would be along 4th dimension
    NDRange global(blkX * in.info.dims[3] * THREADS_X, blkY * THREADS_Y);

    auto launchConvert = KernelFunctor<Buffer, KParam, Buffer, KParam, int> (*cached.ker);

    launchConvert(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, blkX);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 15
0
// Writes an identity matrix (ONE on the diagonal, ZERO elsewhere) into
// `out`, batched over dims 2 and 3. T is a template parameter of the
// enclosing scope.
static void identity(Param out) {
    // One cached program per element type.
    std::string cacheKey = std::string("identity_kernel") +
                           std::string(dtype_traits<T>::getName());

    int devId         = getActiveDeviceId();
    kc_entry_t cached = kernelCache(devId, cacheKey);

    if (cached.prog == 0 && cached.ker == 0) {
        std::ostringstream defs;
        defs << " -D T=" << dtype_traits<T>::getName() << " -D ONE=(T)("
             << scalar_to_option(scalar<T>(1)) << ")"
             << " -D ZERO=(T)(" << scalar_to_option(scalar<T>(0)) << ")";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            defs << " -D USE_DOUBLE";
        }

        const char* srcs[] = {identity_cl};
        const int   lens[] = {identity_cl_len};
        Program prog;
        buildProgram(prog, 1, srcs, lens, defs.str());
        cached.prog = new Program(prog);
        cached.ker  = new Kernel(*cached.prog, "identity_kernel");

        addKernelToCache(devId, cacheKey, cached);
    }

    NDRange local(32, 8);
    int grpX = divup(out.info.dims[0], local[0]);
    int grpY = divup(out.info.dims[1], local[1]);
    NDRange global(grpX * out.info.dims[2] * local[0],
                   grpY * out.info.dims[3] * local[1]);

    auto launchIdentity = KernelFunctor<Buffer, const KParam, int, int>(*cached.ker);

    launchIdentity(EnqueueArgs(getQueue(), global, local), *(out.data), out.info,
                   grpX, grpY);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 16
0
// Unwraps wx x wy windows of `in` (stride sx/sy, padding px/py) into columns
// or rows of `out`, im2col-style. `nx` is forwarded to the kernel —
// presumably the number of windows along dim 0; confirm against unwrap.cl.
// T is a template parameter of the enclosing scope.
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
            const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
            const dim_t nx, const bool is_column) {
    // Cache key: the column/row layout is baked into the program.
    std::string ref_name = std::string("unwrap_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(is_column);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    // Build and cache the program on first use for this device.
    if (entry.prog == 0 && entry.ker == 0) {
        ToNumStr<T> toNumStr;
        std::ostringstream options;
        options << " -D is_column=" << is_column
                << " -D ZERO=" << toNumStr(scalar<T>(0))
                << " -D T=" << dtype_traits<T>::getName();

        // Double precision must be enabled explicitly in OpenCL.
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        Program prog;
        buildProgram(prog, unwrap_cl, unwrap_cl_len, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "unwrap_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    // Launch geometry differs between column-major and row-major unwrapping.
    dim_t TX = 1, TY = 1;
    dim_t BX       = 1;
    const dim_t BY = out.info.dims[2] * out.info.dims[3];
    dim_t reps     = 1;

    if (is_column) {
        // Work-group width tracks the output column length, capped at
        // THREADS_PER_GROUP; each thread repeats to cover the wx*wy window.
        TX   = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
        TY   = THREADS_PER_GROUP / TX;
        BX   = divup(out.info.dims[1], TY);
        reps = divup((wx * wy), TX);
    } else {
        TX   = THREADS_X;
        TY   = THREADS_Y;
        BX   = divup(out.info.dims[0], TX);
        reps = divup((wx * wy), TY);
    }

    NDRange local(TX, TY);
    NDRange global(local[0] * BX, local[1] * BY);

    auto unwrapOp =
        KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                      const int, const int, const int, const int, const int,
                      const int, const int, const int>(*entry.ker);

    unwrapOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info,
             *in.data, in.info, wx, wy, sx, sy, px, py, nx, reps);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 17
0
        // CSR sparse matrix-vector multiply: out = alpha * A * rhs + beta * out,
        // where A is given by (values, rowIdx, colIdx). The alpha/beta scaling
        // paths are compiled out entirely when the scalars are 1 and 0.
        // T is a template parameter of the enclosing scope (not visible here).
        void csrmv(Param out,
                   const Param &values, const Param &rowIdx, const Param &colIdx,
                   const Param &rhs, const T alpha, const T beta)
        {
            bool use_alpha = (alpha != scalar<T>(1.0));
            bool use_beta = (beta != scalar<T>(0.0));

            // Using greedy indexing is causing performance issues on many platforms
            // FIXME: Figure out why
            bool use_greedy = false;

            // FIXME: Find a better number based on average non zeros per row
            int threads = 64;

            // Cache key: every flag baked into the program must appear here.
            std::string ref_name =
                std::string("csrmv_") +
                std::string(dtype_traits<T>::getName()) +
                std::string("_") +
                std::to_string(use_alpha) +
                std::string("_") +
                std::to_string(use_beta) +
                std::string("_") +
                std::to_string(use_greedy) +
                std::string("_") +
                std::to_string(threads);

            int device = getActiveDeviceId();

            kc_entry_t entry = kernelCache(device, ref_name);

            // Build and cache both kernel variants on first use for this device.
            if (entry.prog==0 && entry.ker==0) {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                options << " -D USE_ALPHA=" << use_alpha;
                options << " -D USE_BETA=" << use_beta;
                options << " -D USE_GREEDY=" << use_greedy;
                options << " -D THREADS=" << threads;

                // Double precision must be enabled explicitly in OpenCL.
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                if (std::is_same<T, cfloat>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D IS_CPLX=1";
                } else {
                    options << " -D IS_CPLX=0";
                }

                const char *ker_strs[] = {csrmv_cl};
                const int   ker_lens[] = {csrmv_cl_len};

                Program prog;
                buildProgram(prog, 1, ker_strs, ker_lens, options.str());
                entry.prog = new Program(prog);
                // One cache entry holds both variants: thread-per-row and
                // block-per-row.
                entry.ker  = new Kernel[2];
                entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
                entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

                addKernelToCache(device, ref_name, entry);
            }

            // Device-side counter used by the kernel; zero-initialized here.
            // Presumably only meaningful for the greedy path — TODO confirm.
            int count = 0;
            cl::Buffer *counter = bufferAlloc(sizeof(int));
            getQueue().enqueueWriteBuffer(*counter, CL_TRUE,
                                          0,
                                          sizeof(int),
                                          (void *)&count);

            // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
            bool is_csrmv_block = true;
            auto csrmv_kernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];
            auto csrmv_func = KernelFunctor<Buffer,
                                            Buffer, Buffer, Buffer,
                                            int,
                                            Buffer, KParam, T, T, Buffer>(csrmv_kernel);

            NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);
            // CSR has rows+1 offsets, hence the -1 for the row count.
            int M = rowIdx.info.dims[0] - 1;

            int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
            groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);
            NDRange global(local[0] * groups_x, 1);

            csrmv_func(EnqueueArgs(getQueue(), global, local),
                        *out.data, *values.data, *rowIdx.data, *colIdx.data,
                        M, *rhs.data, rhs.info, alpha, beta, *counter);

            CL_DEBUG_FINISH(getQueue());
            bufferFree(counter);
        }
Exemplo n.º 18
0
// Sparse(CSR) x dense matrix product launcher:
//   out = alpha * A * op(B) + beta * out
// where A is given by (values, rowIdx, colIdx) and `rhs` holds B.
// NOTE(review): the exact operand orientation (the "_nt" transpose
// convention) is defined inside csrmm_cl — confirm against the kernel.
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    // Bake alpha/beta handling into the kernel only when they are not
    // the identity values (alpha == 1, beta == 0).
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    // One compiled program is cached per (type, alpha, beta, greedy) combo.
    std::string ref_name = std::string("csrmm_nt_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        // Build the program once; subsequent calls hit the cache above.
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam,
                      T, T, Buffer>(csrmm_nt_kernel);
    NDRange local(THREADS_PER_GROUP, 1);
    // M: number of rows of the CSR matrix (rowIdx has M + 1 entries).
    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // Per-group counters, zero-initialized (vector<int> value-initializes
    // its elements) and uploaded synchronously before launch.
    std::vector<int> count(groups_x);
    cl::Buffer *counter = bufferAlloc(count.size() * sizeof(int));
    getQueue().enqueueWriteBuffer(
        *counter, CL_TRUE, 0, count.size() * sizeof(int), (void *)count.data());

    csrmm_nt_func(EnqueueArgs(getQueue(), global, local), *out.data,
                  *values.data, *rowIdx.data, *colIdx.data, M, N, *rhs.data,
                  rhs.info, alpha, beta, *counter);

    // Consistency fix: every other launcher in this file flushes the queue
    // in debug builds before releasing its temporaries; this one did not.
    CL_DEBUG_FINISH(getQueue());
    bufferFree(counter);
}
Exemplo n.º 19
0
// First-pass mean reduction along dimension 0.
// out      : partial results, one per (group_x, group_y) tile
// owt      : optional output weights; participates only when its volume is
//            non-zero (see output_weight below)
// in       : input array
// inWeight : optional per-element input weights; participates only when its
//            volume is non-zero (see input_weight below)
// threads_x / groups_x / groups_y : launch geometry chosen by the caller
void mean_first_launcher(Param out, Param owt,
        Param in, Param inWeight,
        const int threads_x,
        const uint groups_x,
        const uint groups_y)
{

    // A weight array is "present" when its 4-D volume is non-zero; empty
    // arrays act as a sentinel for the unweighted variants.
    bool input_weight = ((inWeight.info.dims[0] *
                          inWeight.info.dims[1] *
                          inWeight.info.dims[2] *
                          inWeight.info.dims[3]) != 0);

    bool output_weight = (( owt.info.dims[0] *
                            owt.info.dims[1] *
                            owt.info.dims[2] *
                            owt.info.dims[3]) != 0);

    // Cache key: one compiled program per (types, block width, weight-mode)
    // combination, since all of these are baked in as compile-time defines.
    std::string ref_name =
        std::string("mean_0_") +
        std::string(dtype_traits<Ti>::getName()) +
        std::string("_") +
        std::string(dtype_traits<Tw>::getName()) +
        std::string("_") +
        std::string(dtype_traits<To>::getName()) +
        std::string("_") +
        std::to_string(threads_x) +
        std::string("_") +
        std::to_string(input_weight) +
        std::string("_") +
        std::to_string(output_weight);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) {

        // Helpers that render the identity element and weight constants as
        // source-text literals for the -D defines below.
        Binary<To, af_add_t> mean;
        ToNumStr<To> toNumStr;
        ToNumStr<Tw> twNumStr;
        Transform<uint, Tw, af_add_t> transform_weight;

        std::ostringstream options;
        options << " -D Ti=" << dtype_traits<Ti>::getName()
            << " -D Tw=" << dtype_traits<Tw>::getName()
            << " -D To=" << dtype_traits<To>::getName()
            << " -D DIMX=" << threads_x
            << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
            << " -D init_To=" << toNumStr(mean.init())
            << " -D init_Tw=" << twNumStr(transform_weight(0))
            << " -D one_Tw=" << twNumStr(transform_weight(1));

        // The kernel source selects its argument list from these defines;
        // they must agree with the KernelFunctor signatures chosen below.
        if (input_weight) { options << " -D INPUT_WEIGHT"; }
        if (output_weight) { options << " -D OUTPUT_WEIGHT"; }

        if (std::is_same<Ti, double>::value ||
                std::is_same<Ti, cdouble>::value ||
                std::is_same<To, double>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
        const int   ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "mean_first_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    // 2-D work-groups: threads_x wide, filled up to THREADS_PER_GROUP.
    // dims[2]/dims[3] are folded into the grid so one launch covers batches.
    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
            groups_y * in.info.dims[3] * local[1]);

    // Elements along dim 0 each work-item must accumulate sequentially.
    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

    // Four argument-list variants matching the INPUT_WEIGHT/OUTPUT_WEIGHT
    // defines the cached program was compiled with.
    if (input_weight && output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *owt.data, owt.info,
                *in.data, in.info,
                *inWeight.data, inWeight.info,
                groups_x, groups_y, repeat);
    } else if (!input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                groups_x, groups_y, repeat);
    } else if ( input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                *inWeight.data, inWeight.info,
                groups_x, groups_y, repeat);
    } else if (!input_weight &&  output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *owt.data, owt.info,
                *in.data, in.info,
                groups_x, groups_y, repeat);
    }

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 20
0
// Resizes `in` into `out` using the interpolation selected by the
// compile-time `method` (nearest / bilinear / lower). Scale factors are
// derived from the ratio of input to output extents.
void resize(Param out, const Param in) {
    typedef typename dtype_traits<T>::base_type BT;

    // Bug fix: the cache key previously said "reorder_kernel_" (copy-paste
    // from the reorder launcher), which was misleading and risked colliding
    // with reorder's cache entries. Use a resize-specific key.
    std::string refName = std::string("resize_kernel_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::string("_") + std::to_string(method);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D VT=" << dtype_traits<vtype_t<T>>::getName();
        options << " -D WT=" << dtype_traits<wtype_t<BT>>::getName();

        // Select the interpolation variant compiled into the kernel.
        switch (method) {
            case AF_INTERP_NEAREST: options << " -D INTERP=NEAREST"; break;
            case AF_INTERP_BILINEAR: options << " -D INTERP=BILINEAR"; break;
            case AF_INTERP_LOWER: options << " -D INTERP=LOWER"; break;
            default: break;
        }

        // Complex types need the base (real) type for arithmetic.
        if ((af_dtype)dtype_traits<T>::af_type == c32 ||
            (af_dtype)dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
            options << " -D TB=" << dtype_traits<BT>::getName();
        } else {
            options << " -D CPLX=0";
        }

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {resize_cl};
        const int ker_lens[]   = {resize_cl_len};
        cl::Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new cl::Program(prog);
        entry.ker  = new cl::Kernel(*entry.prog, "resize_kernel");

        addKernelToCache(device, refName, entry);
    }

    auto resizeOp =
        cl::KernelFunctor<cl::Buffer, const KParam, const cl::Buffer,
                          const KParam, const int, const int, const float,
                          const float>(*entry.ker);

    cl::NDRange local(RESIZE_TX, RESIZE_TY, 1);

    // One tile per (blocksPerMatX, blocksPerMatY); dims[2]/dims[3] batches
    // are folded into the grid so a single launch handles all slices.
    int blocksPerMatX = divup(out.info.dims[0], local[0]);
    int blocksPerMatY = divup(out.info.dims[1], local[1]);
    cl::NDRange global(local[0] * blocksPerMatX * in.info.dims[2],
                       local[1] * blocksPerMatY * in.info.dims[3], 1);

    // Per-axis scale: input extent / output extent (computed in double,
    // passed to the kernel as float).
    double xd = (double)in.info.dims[0] / (double)out.info.dims[0];
    double yd = (double)in.info.dims[1] / (double)out.info.dims[1];

    float xf = (float)xd, yf = (float)yd;

    resizeOp(cl::EnqueueArgs(getQueue(), global, local), *out.data, out.info,
             *in.data, in.info, blocksPerMatX, blocksPerMatY, xf, yf);

    CL_DEBUG_FINISH(getQueue());
}
Exemplo n.º 21
0
// Swaps the diagonal nb-by-nb blocks of two matrices dA and dB (MAGMA-style
// swapdblk). `inca`/`incb` give the block stride along the diagonal in
// units of nb columns; `queue` is the raw OpenCL queue to launch on.
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb,
              cl_command_queue queue) {
    std::string refName =
        std::string("swapdblk_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;

        options << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {swapdblk_cl};
        const int ker_lens[]   = {swapdblk_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "swapdblk");

        addKernelToCache(device, refName, entry);
    }

    // Bug fixes vs. the previous version:
    //  1. `n / nb` was computed before nb was validated, so nb == 0 divided
    //     by zero before the -2 check could ever fire.
    //  2. The `nblocks == 0` early return preceded all validation, so
    //     invalid configurations (e.g. n < 0) were silently ignored.
    // Validate first, divide only once nb is known to be sane, and take the
    // trivial-size early exit last.
    int info    = 0;
    int nblocks = 0;
    if (n < 0) {
        info = -1;
    } else if (nb < 1 || nb > 1024) {
        info = -2;
    } else {
        nblocks = n / nb;
        if (ldda < (nblocks - 1) * nb * inca + nb) {
            info = -4;
        } else if (inca < 0) {
            info = -5;
        } else if (lddb < (nblocks - 1) * nb * incb + nb) {
            info = -7;
        } else if (incb < 0) {
            info = -8;
        }
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;  // defensive: unreachable if AF_ERROR throws
    }

    if (nblocks == 0) return;

    // One work-group of nb threads per diagonal block (nb <= 1024 checked).
    NDRange local(nb);
    NDRange global(nblocks * nb);

    // Wrap the raw cl_mem handles without taking ownership (retain = true).
    cl::Buffer dAObj(dA, true);
    cl::Buffer dBObj(dB, true);

    auto swapdOp =
        KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer,
                      unsigned long long, int, int>(*entry.ker);

    cl::CommandQueue q(queue);
    swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca,
            dBObj, dB_offset, lddb, incb);
}