Example #1
void evalMultiple(vector<Array<T>*> array_ptrs)
{
    vector<Array<T>> arrays;
    vector<TNJ::Node_ptr> nodes;
    bool isWorker = getQueue().is_worker();
    for (auto &array : array_ptrs) {
        if (array->ready) continue;
        if (isWorker) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
        array->setId(getActiveDeviceId());
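        // memAlloc returns owning memory; release it into a shared_ptr with memFree<T> as its deleter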
        array->data = shared_ptr<T>(memAlloc<T>(array->elements()).release(), memFree<T>);
        arrays.push_back(*array);
        nodes.push_back(array->node);
    }

    vector<Param<T>> params(arrays.begin(), arrays.end());
    if (arrays.size() > 0) {
        getQueue().enqueue(kernel::evalMultiple<T>, params, nodes);
        for (auto &array : array_ptrs) {
            if (array->ready) continue;
            array->ready = true;
            array->node = bufferNodePtr<T>();
        }
    }
}
Example #2
Array<T>::Array(const af::dim4 &dims, Node_ptr n)
    : info(getActiveDeviceId(), dims, 0, calcStrides(dims),
           (af_dtype)dtype_traits<T>::af_type)
    , data()
    , data_dims(dims)
    , node(n)
    , ready(false)
    , owner(true) {}
Example #3
Array<T>::Array(dim4 dims)
    : info(getActiveDeviceId(), dims, 0, calcStrides(dims),
           (af_dtype)dtype_traits<T>::af_type)
    , data(memAlloc<T>(dims.elements()).release(), memFree<T>)
    , data_dims(dims)
    , node(bufferNodePtr<T>())
    , ready(true)
    , owner(true) {}
Example #4
 void memPush(const T *ptr)
 {
     int n = getActiveDeviceId();
     mem_iter iter = memory_maps[n].find((void *)ptr);
     if (iter != memory_maps[n].end()) {
         iter->second.is_unlinked = false;
     }
 }
Example #5
        void select_launcher(Param out, Param cond, Param a, Param b, int ndims)
        {
            static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
            static std::map<int, Program*>  selProgs;
            static std::map<int, Kernel*> selKernels;

            int device = getActiveDeviceId();

            std::call_once(compileFlags[device], [device] () {

                    std::ostringstream options;
                    options << " -D is_same=" << is_same
                            << " -D T=" << dtype_traits<T>::getName();

                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }

                    cl::Program prog;
                    buildProgram(prog, select_cl, select_cl_len, options.str());
                    selProgs[device] = new Program(prog);

                    selKernels[device] = new Kernel(*selProgs[device], "select_kernel");
                });


            int threads[] = {DIMX, DIMY};

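            // For 1D launches, collapse the 2D thread block into a single row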
            if (ndims == 1) {
                threads[0] *= threads[1];
                threads[1] = 1;
            }

            NDRange local(threads[0],
                          threads[1]);


            int groups_0 = divup(out.info.dims[0], local[0]);
            int groups_1 = divup(out.info.dims[1], local[1]);

            NDRange global(groups_0 * out.info.dims[2] * local[0],
                           groups_1 * out.info.dims[3] * local[1]);

            auto selectOp = make_kernel<Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        int, int>(*selKernels[device]);

            selectOp(EnqueueArgs(getQueue(), global, local),
                     *out.data, out.info,
                     *cond.data, cond.info,
                     *a.data, a.info,
                     *b.data, b.info,
                     groups_0, groups_1);

        }
Example #6
    void bufferPush(cl::Buffer *ptr)
    {
        int n = getActiveDeviceId();
        mem_iter iter = memory_maps[n].find(ptr);

        if (iter != memory_maps[n].end()) {
            iter->second.is_unlinked = false;
        }
    }
Example #7
 void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
                       size_t *lock_bytes,  size_t *lock_buffers)
 {
     int n = getActiveDeviceId();
     if (alloc_bytes   ) *alloc_bytes   = total_bytes[n];
     if (alloc_buffers ) *alloc_buffers = memory_maps[n].size();
     if (lock_bytes    ) *lock_bytes    = used_bytes[n];
     if (lock_buffers  ) *lock_buffers  = used_buffers[n];
 }
Example #8
void convolve2(Param out, const Param signal, const Param filter)
{
    try {
        static std::once_flag  compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>   convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                const size_t C0_SIZE  = (THREADS_X+2*(fLen-1))* THREADS_Y;
                const size_t C1_SIZE  = (THREADS_Y+2*(fLen-1))* THREADS_X;

                size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D accType=" << dtype_traits<accType>::getName()
                        << " -D CONV_DIM=" << conv_dim
                        << " -D EXPAND=" << expand
                        << " -D FLEN=" << fLen
                        << " -D LOCAL_MEM_SIZE=" << locSize;
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                Program prog;
                buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
                convProgs[device]   = new Program(prog);
                convKernels[device] = new Kernel(*convProgs[device], "convolve");
                });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen*sizeof(accType));
        // FIX ME: if the filter array is strided, direct might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen*sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        bufferFree(mBuff);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #9
        void sparseArithOpCSR(Param out, const Param values, const Param rowIdx, const Param colIdx,
                const Param rhs, const bool reverse)
        {
            std::string ref_name =
                std::string("sparseArithOpCSR_") +
                getOpString<op>() + std::string("_") +
                std::string(dtype_traits<T>::getName());

            int device = getActiveDeviceId();
            kc_entry_t entry = kernelCache(device, ref_name);

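            // First use on this device: build the program and cache the kernel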
            if (entry.prog==0 && entry.ker==0) {

                std::ostringstream options;
                options << " -D T="  << dtype_traits<T>::getName();
                options << " -D OP=" << getOpString<op>();

                if((af_dtype) dtype_traits<T>::af_type == c32 ||
                        (af_dtype) dtype_traits<T>::af_type == c64) {
                    options << " -D IS_CPLX=1";
                } else {
                    options << " -D IS_CPLX=0";
                }
                if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                const char *ker_strs[] = {sparse_arith_common_cl    , sparse_arith_csr_cl};
                const int   ker_lens[] = {sparse_arith_common_cl_len, sparse_arith_csr_cl_len};

                Program prog;
                buildProgram(prog, 2, ker_strs, ker_lens, options.str());
                entry.prog = new Program(prog);
                entry.ker  = new Kernel(*entry.prog, "sparse_arith_csr_kernel");

                addKernelToCache(device, ref_name, entry);
            }

            auto sparseArithCSROp = KernelFunctor<Buffer, const KParam,
                 const Buffer, const Buffer, const Buffer,
                 const int,
                 const Buffer, const KParam,
                 const int>(*entry.ker);

            NDRange local(TX, TY, 1);
            NDRange global(divup(out.info.dims[0], TY) * TX, TY, 1);

            sparseArithCSROp(EnqueueArgs(getQueue(), global, local),
                    *out.data, out.info,
                    *values.data, *rowIdx.data, *colIdx.data, values.info.dims[0],
                    *rhs.data, rhs.info, reverse);

            CL_DEBUG_FINISH(getQueue());
        }
Example #10
void matchTemplate(Param out, const Param srch, const Param tmplt)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  mtProgs;
        static std::map<int, Kernel*> mtKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D inType="  << dtype_traits<inType>::getName()
                        << " -D outType=" << dtype_traits<outType>::getName()
                        << " -D MATCH_T=" << mType
                        << " -D NEEDMEAN="<< needMean
                        << " -D AF_SAD="  << AF_SAD
                        << " -D AF_ZSAD=" << AF_ZSAD
                        << " -D AF_LSAD=" << AF_LSAD
                        << " -D AF_SSD="  << AF_SSD
                        << " -D AF_ZSSD=" << AF_ZSSD
                        << " -D AF_LSSD=" << AF_LSSD
                        << " -D AF_NCC="  << AF_NCC
                        << " -D AF_ZNCC=" << AF_ZNCC
                        << " -D AF_SHD="  << AF_SHD;
                if (std::is_same<outType, double>::value) {
                    options << " -D USE_DOUBLE";
                }
                Program prog;
                buildProgram(prog, matchTemplate_cl, matchTemplate_cl_len, options.str());
                mtProgs[device]   = new Program(prog);
                mtKernels[device] = new Kernel(*mtProgs[device], "matchTemplate");
            });

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(srch.info.dims[0], THREADS_X);
        int blk_y = divup(srch.info.dims[1], THREADS_Y);

        NDRange global(blk_x * srch.info.dims[2] * THREADS_X, blk_y * srch.info.dims[3] * THREADS_Y);

        auto matchImgOp = make_kernel<Buffer, KParam,
                                       Buffer, KParam,
                                       Buffer, KParam,
                                       int, int> (*mtKernels[device]);

        matchImgOp(EnqueueArgs(getQueue(), global, local),
                    *out.data, out.info, *srch.data, srch.info, *tmplt.data, tmplt.info, blk_x, blk_y);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #11
    void memcopy(cl::Buffer out, const dim_t *ostrides,
                 const cl::Buffer in, const dim_t *idims,
                 const dim_t *istrides, int offset, uint ndims)
    {
        try {
            static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
            static std::map<int, Program*>    cpyProgs;
            static std::map<int, Kernel*>   cpyKernels;

            int device = getActiveDeviceId();

            std::call_once(compileFlags[device], [&]() {
                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                Program prog;
                buildProgram(prog, memcopy_cl, memcopy_cl_len, options.str());
                cpyProgs[device]   = new Program(prog);
                cpyKernels[device] = new Kernel(*cpyProgs[device], "memcopy_kernel");
            });

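            // Pack strides and dims into POD dims_t values that are passed by value to the kernel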
            dims_t _ostrides = {{ostrides[0], ostrides[1], ostrides[2], ostrides[3]}};
            dims_t _istrides = {{istrides[0], istrides[1], istrides[2], istrides[3]}};
            dims_t _idims = {{idims[0], idims[1], idims[2], idims[3]}};

            size_t local_size[2] = {DIM0, DIM1};
            if (ndims == 1) {
                local_size[0] *= local_size[1];
                local_size[1]  = 1;
            }

            int groups_0 = divup(idims[0], local_size[0]);
            int groups_1 = divup(idims[1], local_size[1]);

            NDRange local(local_size[0], local_size[1]);
            NDRange global(groups_0 * idims[2] * local_size[0],
                           groups_1 * idims[3] * local_size[1]);

            auto memcopy_kernel = KernelFunctor< Buffer, dims_t,
                                               Buffer, dims_t,
                                               dims_t, int,
                                               int, int >(*cpyKernels[device]);

            memcopy_kernel(EnqueueArgs(getQueue(), global, local),
                out, _ostrides, in, _idims, _istrides, offset, groups_0, groups_1);
            CL_DEBUG_FINISH(getQueue());
        }
        catch (cl::Error err) {
            CL_TO_AF_ERROR(err);
            throw;
        }
    }
Example #12
void transpose(Param out, const Param in)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  trsProgs;
        static std::map<int, Kernel*> trsKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D TILE_DIM=" << TILE_DIM
                        << " -D THREADS_Y=" << THREADS_Y
                        << " -D IS32MULTIPLE=" << IS32MULTIPLE
                        << " -D DOCONJUGATE=" << (conjugate && af::iscplx<T>())
                        << " -D T=" << dtype_traits<T>::getName();

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, transpose_cl, transpose_cl_len, options.str());
                trsProgs[device] = new Program(prog);

                trsKernels[device] = new Kernel(*trsProgs[device], "transpose");
            });


        NDRange local(THREADS_X, THREADS_Y);

        dim_type blk_x = divup(in.info.dims[0], TILE_DIM);
        dim_type blk_y = divup(in.info.dims[1], TILE_DIM);

        // launch batch * blk_x blocks along x dimension
        NDRange global(blk_x * local[0] * in.info.dims[2],
                       blk_y * local[1]);

        auto transposeOp = make_kernel<Buffer, const KParam,
                                       const Buffer, const KParam,
                                       const dim_type> (*trsKernels[device]);

        transposeOp(EnqueueArgs(getQueue(), global, local),
                    *out.data, out.info, *in.data, in.info, blk_x);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #13
void histogram(Param out, const Param in, const Param minmax, dim_type nbins)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> histProgs;
        static std::map<int, Kernel *> histKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D inType=" << dtype_traits<inType>::getName()
                            << " -D outType=" << dtype_traits<outType>::getName()
                            << " -D THRD_LOAD=" << THRD_LOAD;

                    if (std::is_same<inType, double>::value ||
                        std::is_same<inType, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }

                    Program prog;
                    buildProgram(prog, histogram_cl, histogram_cl_len, options.str());
                    histProgs[device]   = new Program(prog);
                    histKernels[device] = new Kernel(*histProgs[device], "histogram");
                });

        auto histogramOp = make_kernel<Buffer, KParam, Buffer, KParam,
                                       Buffer, cl::LocalSpaceArg,
                                       dim_type, dim_type, dim_type
                                      >(*histKernels[device]);

        NDRange local(THREADS_X, 1);

        dim_type numElements = in.info.dims[0]*in.info.dims[1];

        dim_type blk_x       = divup(numElements, THRD_LOAD*THREADS_X);

        dim_type batchCount  = in.info.dims[2];

        NDRange global(blk_x*THREADS_X, batchCount);

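        // Per-work-group local memory: one outType counter per bin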
        dim_type locSize = nbins * sizeof(outType);

        histogramOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info, *in.data, in.info, *minmax.data,
                cl::Local(locSize), numElements, nbins, blk_x);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #14
    void memFree(T *ptr)
    {
        int n = getActiveDeviceId();
        mem_iter iter = memory_maps[n].find((void *)ptr);

        if (iter != memory_maps[n].end()) {
            iter->second.is_free = true;
            used_bytes -= iter->second.bytes;
        } else {
            cudaFreeWrapper(ptr); // Free it because we are not sure what the size is
        }
    }
Example #15
        void gradient(Param grad0, Param grad1, const Param in)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>  gradProgs;
                static std::map<int, Kernel*> gradKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D TX=" << TX
                            << " -D TY=" << TY;

                    if((af_dtype) dtype_traits<T>::af_type == c32 ||
                       (af_dtype) dtype_traits<T>::af_type == c64) {
                        options << " -D CPLX=1";
                    } else {
                        options << " -D CPLX=0";
                    }
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, gradient_cl, gradient_cl_len, options.str());
                    gradProgs[device]   = new Program(prog);
                    gradKernels[device] = new Kernel(*gradProgs[device], "gradient_kernel");
                });

                auto gradOp = make_kernel<Buffer, const KParam, Buffer, const KParam,
                                    const Buffer, const KParam, const dim_type, const dim_type>
                                        (*gradKernels[device]);

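                // Tile dims 0 and 1; fold the batch dims 2 and 3 into the global range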
                NDRange local(TX, TY, 1);

                dim_type blocksPerMatX = divup(in.info.dims[0], TX);
                dim_type blocksPerMatY = divup(in.info.dims[1], TY);
                NDRange global(local[0] * blocksPerMatX * in.info.dims[2],
                               local[1] * blocksPerMatY * in.info.dims[3],
                               1);

                gradOp(EnqueueArgs(getQueue(), global, local),
                        *grad0.data, grad0.info, *grad1.data, grad1.info,
                        *in.data, in.info, blocksPerMatX, blocksPerMatY);

                CL_DEBUG_FINISH(getQueue());
            } catch (cl::Error err) {
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Example #16
        void diff(Param out, const Param in, const unsigned indims)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>   diffProgs;
                static std::map<int, Kernel*>  diffKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D T="        << dtype_traits<T>::getName()
                            << " -D DIM="      << dim
                            << " -D isDiff2=" << isDiff2;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, diff_cl, diff_cl_len, options.str());
                    diffProgs[device]   = new Program(prog);
                    diffKernels[device] = new Kernel(*diffProgs[device], "diff_kernel");
                });

                auto diffOp = make_kernel<Buffer, const Buffer, const KParam, const KParam,
                                          const dim_type, const dim_type, const dim_type>
                                          (*diffKernels[device]);

                NDRange local(TX, TY, 1);
                if(dim == 0 && indims == 1) {
                    local = NDRange(TX * TY, 1, 1);
                }

                dim_type blocksPerMatX = divup(out.info.dims[0], local[0]);
                dim_type blocksPerMatY = divup(out.info.dims[1], local[1]);
                NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                               local[1] * blocksPerMatY * out.info.dims[3],
                               1);

                const dim_type oElem = out.info.dims[0] * out.info.dims[1]
                                     * out.info.dims[2] * out.info.dims[3];

                diffOp(EnqueueArgs(getQueue(), global, local),
                       *out.data, *in.data, out.info, in.info,
                       oElem, blocksPerMatX, blocksPerMatY);

                CL_DEBUG_FINISH(getQueue());
            } catch (cl::Error err) {
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Example #17
    void pinnedBufferFree(void *ptr)
    {
        int n = getActiveDeviceId();
        pinned_iter iter = pinned_maps[n].find(ptr);

        if (iter != pinned_maps[n].end()) {
            iter->second.info.is_free = true;
            pinned_used_bytes -= iter->second.info.bytes;
        }
        // An untracked pointer cannot be released here: with no map entry,
        // the owning cl::Buffer is unknown, and dereferencing the end()
        // iterator to get it would be undefined behavior.
    }
Example #18
Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
    info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
    data((is_device && !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()).release(),
         memFree<T>),
    data_dims(dims),
    node(bufferNodePtr<T>()), ready(true), owner(true)
{
    static_assert(is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
    static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
    if (!is_device || copy_device) {
        // Ensure the memory being written to isn't used anywhere else.
        getQueue().sync();
        copy(in_data, in_data + dims.elements(), data.get());
    }
}
Example #19
    void randu(T *out, size_t elements)
    {
        int device = getActiveDeviceId();

        int threads = THREADS;
        int blocks  = divup(elements, THREADS);
        if (blocks > BLOCKS) blocks = BLOCKS;

        curandState_t *state = getcurandState();

        CUDA_LAUNCH(uniform_kernel, blocks, threads, out, state, elements);
        POST_LAUNCH_CHECK();
    }
Example #20
void morph3d(Param out, const Param in, const Param mask)
{
    std::string refName = std::string("morph3d_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(isDilation) + std::to_string(SeLength);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::string options = generateOptionsString<T, isDilation, SeLength>();
        const char* ker_strs[] = {morph_cl};
        const int   ker_lens[] = {morph_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "morph3d");
        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer,
                                  cl::LocalSpaceArg, int >(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);
    // launch batch * blk_x blocks along x dimension
    NDRange global(blk_x * CUBE_X * in.info.dims[3], blk_y * CUBE_Y, blk_z * CUBE_Z);

    // copy mask/filter to constant memory
    cl_int se_size   = sizeof(T)*SeLength*SeLength*SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

    // calculate shared memory size
    const int padding = (SeLength%2==0 ? (SeLength-1) : (2*(SeLength/2)));
    const int locLen  = CUBE_X+padding+1;
    const int locArea = locLen *(CUBE_Y+padding);
    const int locSize = locArea*(CUBE_Z+padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize*sizeof(T)), blk_x);

    bufferFree(mBuff);
    CL_DEBUG_FINISH(getQueue());
}
Example #21
void Array<T>::eval() {
    if (isReady()) return;
    if (getQueue().is_worker())
        AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);

    this->setId(getActiveDeviceId());

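    // Allocate the backing buffer before enqueueing the evaluation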
    data = shared_ptr<T>(memAlloc<T>(elements()).release(), memFree<T>);

    getQueue().enqueue(kernel::evalArray<T>, *this, this->node);
    // Reset shared_ptr
    this->node = bufferNodePtr<T>();
    ready      = true;
}
Example #22
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
                    const unsigned idim0, const unsigned idim1,
                    const cl::Buffer* resp_in, const unsigned edge,
                    const unsigned max_corners) {
    unsigned corners_found = 0;

    std::string refName =
        std::string("non_maximal_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {susan_cl};
        const int ker_lens[]   = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "non_maximal");

        addKernelToCache(device, refName, entry);
    }

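    // Device-side counter for the number of corners found, written from the host below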
    cl::Buffer* d_corners_found = bufferAlloc(sizeof(unsigned));
    getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0,
                                  sizeof(unsigned), &corners_found);

    auto nonMaximalOp =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, unsigned, unsigned,
                      Buffer, unsigned, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    nonMaximalOp(EnqueueArgs(getQueue(), global, local), *x_out, *y_out,
                 *resp_out, *d_corners_found, idim0, idim1, *resp_in, edge,
                 max_corners);

    getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned),
                                 &corners_found);
    bufferFree(d_corners_found);

    return corners_found;
}
Example #23
void diff(Param out, const Param in, const unsigned indims)
{
    std::string refName = std::string("diff_kernel_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(dim) +
        std::to_string(isDiff2);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T="        << dtype_traits<T>::getName()
                << " -D DIM="      << dim
                << " -D isDiff2=" << isDiff2;
        if (std::is_same<T, double>::value ||
            std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* ker_strs[] = {diff_cl};
        const int   ker_lens[] = {diff_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "diff_kernel");

        addKernelToCache(device, refName, entry);
    }

    auto diffOp = KernelFunctor< Buffer, const Buffer, const KParam, const KParam,
                                 const int, const int, const int> (*entry.ker);

    NDRange local(TX, TY, 1);
    if(dim == 0 && indims == 1) {
        local = NDRange(TX * TY, 1, 1);
    }

    int blocksPerMatX = divup(out.info.dims[0], local[0]);
    int blocksPerMatY = divup(out.info.dims[1], local[1]);
    NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                   local[1] * blocksPerMatY * out.info.dims[3], 1);

    const int oElem = out.info.dims[0] * out.info.dims[1] * out.info.dims[2] * out.info.dims[3];

    diffOp(EnqueueArgs(getQueue(), global, local),
           *out.data, *in.data, out.info, in.info, oElem, blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
Example #24
void medfilt2(Param out, const Param in) {
    std::string refName =
        std::string("medfilt2_") + std::string(dtype_traits<T>::getName()) +
        std::to_string(pad) + std::to_string(w_len) + std::to_string(w_wid);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        const int ARR_SIZE = w_len * (w_wid - w_wid / 2);

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName() << " -D pad=" << pad
                << " -D AF_PAD_ZERO=" << AF_PAD_ZERO
                << " -D AF_PAD_SYM=" << AF_PAD_SYM
                << " -D ARR_SIZE=" << ARR_SIZE << " -D w_len=" << w_len
                << " -D w_wid=" << w_wid;
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {medfilt2_cl};
        const int ker_lens[]   = {medfilt2_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "medfilt2");

        addKernelToCache(device, refName, entry);
    }

    NDRange local(THREADS_X, THREADS_Y);

    int blk_x = divup(in.info.dims[0], THREADS_X);
    int blk_y = divup(in.info.dims[1], THREADS_Y);

    NDRange global(blk_x * in.info.dims[2] * THREADS_X,
                   blk_y * in.info.dims[3] * THREADS_Y);

    auto medfiltOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                   cl::LocalSpaceArg, int, int>(*entry.ker);

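    // Local tile: the work-group block plus the (w_len-1) x (w_wid-1) filter halo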
    size_t loc_size =
        (THREADS_X + w_len - 1) * (THREADS_Y + w_wid - 1) * sizeof(T);

    medfiltOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info,
              *in.data, in.info, cl::Local(loc_size), blk_x, blk_y);

    CL_DEBUG_FINISH(getQueue());
}
Example #25
        void join(Param out, const Param X, const Param Y)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>   joinProgs;
                static std::map<int, Kernel *> joinKernels;

                int device = getActiveDeviceId();

                std::call_once( compileFlags[device], [device] () {
                    std::ostringstream options;
                    options << " -D Tx=" << dtype_traits<Tx>::getName()
                            << " -D Ty=" << dtype_traits<Ty>::getName()
                            << " -D dim=" << dim;

                    if (std::is_same<Tx, double>::value ||
                        std::is_same<Tx, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    } else if (std::is_same<Ty, double>::value ||
                               std::is_same<Ty, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }

                    Program prog;
                    buildProgram(prog, join_cl, join_cl_len, options.str());
                    joinProgs[device] = new Program(prog);
                    joinKernels[device] = new Kernel(*joinProgs[device], "join_kernel");
                });

                auto joinOp = make_kernel<Buffer, const KParam, const Buffer, const KParam,
                              const Buffer, const KParam, const dim_type, const dim_type> (*joinKernels[device]);

                NDRange local(TX, TY, 1);

                dim_type blocksPerMatX = divup(out.info.dims[0], TILEX);
                dim_type blocksPerMatY = divup(out.info.dims[1], TILEY);
                NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                               local[1] * blocksPerMatY * out.info.dims[3],
                               1);

                joinOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info,
                       *X.data, X.info, *Y.data, Y.info, blocksPerMatX, blocksPerMatY);

                CL_DEBUG_FINISH(getQueue());
            } catch (cl::Error err) {
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Example #26
    void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
               const Array<Tk> &ikey, const Array<Tv> &ival, const unsigned dim)
    {
        if ((std::is_same<Tk, double>::value || std::is_same<Tk, cdouble>::value) &&
            !isDoubleSupported(getActiveDeviceId())) {
            OPENCL_NOT_SUPPORTED();
        }
        if ((std::is_same<Tv, double>::value || std::is_same<Tv, cdouble>::value) &&
            !isDoubleSupported(getActiveDeviceId())) {
            OPENCL_NOT_SUPPORTED();
        }

        try {
            okey = copyArray<Tk>(ikey);
            oval = copyArray<Tv>(ival);
            switch(dim) {
            case 0: kernel::sort0_by_key<Tk, Tv, isAscending>(okey, oval);
                break;
            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
            }
        } catch (std::exception &ex) {
            AF_ERROR(ex.what(), AF_ERR_INTERNAL);
        }
    }
Example #27
Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset_,
                const T * const in_data, bool is_device) :
    info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits<T>::af_type),
    data(is_device ? (T*)in_data : memAlloc<T>(info.total()).release(), memFree<T>),
    data_dims(dims),
    node(bufferNodePtr<T>()),
    ready(true),
    owner(true)
{
    if (!is_device) {
        // Ensure the memory being written to isn't used anywhere else.
        getQueue().sync();
        copy(in_data, in_data + info.total(), data.get());
    }
}
Example #28
void join(Param out, const Param in, const af::dim4 offset)
{
    std::string refName = std::string("join_kernel_") +
        std::string(dtype_traits<To>::getName()) +
        std::string(dtype_traits<Ti>::getName()) +
        std::to_string(dim);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D To=" << dtype_traits<To>::getName()
                << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D dim=" << dim;

        if (std::is_same<To, double>::value || std::is_same<To, cdouble>::value) {
            options << " -D USE_DOUBLE";
        } else if (std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char* ker_strs[] = {join_cl};
        const int   ker_lens[] = {join_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "join_kernel");

        addKernelToCache(device, refName, entry);
    }

    auto joinOp = KernelFunctor<Buffer, const KParam, const Buffer, const KParam,
                                const int, const int, const int, const int,
                                const int, const int> (*entry.ker);

    NDRange local(TX, TY, 1);

    int blocksPerMatX = divup(in.info.dims[0], TILEX);
    int blocksPerMatY = divup(in.info.dims[1], TILEY);
    NDRange global(local[0] * blocksPerMatX * in.info.dims[2],
                   local[1] * blocksPerMatY * in.info.dims[3], 1);

    joinOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info,
           offset[0], offset[1], offset[2], offset[3], blocksPerMatX, blocksPerMatY);

    CL_DEBUG_FINISH(getQueue());
}
Example #29
static Kernel* get_scan_dim_kernels(int kerIdx)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> scanProgs;
        static std::map<int, Kernel*>  scanKerns;
        static std::map<int, Kernel*>  bcastKerns;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {

            Binary<To, op> scan;
            ToNum<To> toNum;

            std::ostringstream options;
            options << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D T=To"
                    << " -D dim=" << dim
                    << " -D DIMY=" << threads_y
                    << " -D THREADS_X=" << THREADS_X
                    << " -D init=" << toNum(scan.init())
                    << " -D " << binOpName<op>()
                    << " -D CPLX=" << af::iscplx<Ti>()
                    << " -D isFinalPass="******" -D USE_DOUBLE";
            }

            const char *ker_strs[] = {ops_cl, scan_dim_cl};
            const int   ker_lens[] = {ops_cl_len, scan_dim_cl_len};
            cl::Program prog;
            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
            scanProgs[device] = new Program(prog);

            scanKerns[device] = new Kernel(*scanProgs[device],  "scan_dim_kernel");
            bcastKerns[device] = new Kernel(*scanProgs[device],  "bcast_dim_kernel");

        });

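        // Both kernels were built from the same program; select by index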
        return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #30
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter)
{
    std::string ref_name = std::string("convolveND_") +
        std::string(dtype_traits<T>::getName()) + std::string(dtype_traits<aT>::getName()) +
        std::to_string(bDim) + std::to_string(expand);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T="         << dtype_traits<T>::getName()
                << " -D Ti="        << dtype_traits<T>::getName()
                << " -D To="        << dtype_traits<aT>::getName()
                << " -D accType="   << dtype_traits<aT>::getName()
                << " -D BASE_DIM="  << bDim
                << " -D EXPAND="    << expand
                << " -D "           << binOpName<af_mul_t>();

        if((af_dtype) dtype_traits<T>::af_type == c32 ||
            (af_dtype) dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
        } else {
            options << " -D CPLX=0";
        }
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char *ker_strs[] = {ops_cl, convolve_cl};
        const int   ker_lens[] = {ops_cl_len, convolve_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "convolve");

        addKernelToCache(device, ref_name, entry);
    }

    auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam, cl::LocalSpaceArg, Buffer, KParam,
                                    int, int, int, int, int, int, int, int >(*entry.ker);

    convOp(EnqueueArgs(getQueue(), param.global, param.local),
           *out.data, out.info, *signal.data, signal.info, cl::Local(param.loc_size),
           *param.impulse, filter.info, param.nBBS0, param.nBBS1,
           param.o[0], param.o[1], param.o[2], param.s[0], param.s[1], param.s[2]);
}