Esempio n. 1
0
/// Enqueues a counter-based random engine kernel that writes into `out`.
///
/// @param out      device buffer handed to the kernel as its output
/// @param elements number of values requested; also the amount by which
///                 `counter` is advanced
/// @param type     engine selector; a kernel is launched only for
///                 AF_RANDOM_ENGINE_PHILOX_4X32_10 and
///                 AF_RANDOM_ENGINE_THREEFRY_2X32_16
/// @param seed     64-bit seed, passed to the kernel as two 32-bit words
/// @param counter  64-bit counter, passed to the kernel as two 32-bit words
///                 and incremented by `elements` before returning
/// @param kerIdx   forwarded unchanged to get_random_engine_kernel
static void randomDistribution(cl::Buffer out, const size_t elements,
                               const af_random_engine_type type,
                               const uintl &seed, uintl &counter, int kerIdx) {
    // Number of T values attributed to one work-group of THREADS items.
    const uint valuesPerGroup = THREADS * 4 * sizeof(uint) / sizeof(T);
    const uint groupCount     = divup(elements, valuesPerGroup);

    // The kernel takes the 64-bit seed and counter as high/low 32-bit words.
    const uint seedHigh    = seed >> 32;
    const uint seedLow     = seed;
    const uint counterHigh = counter >> 32;
    const uint counterLow  = counter;

    NDRange groupShape(THREADS, 1);
    NDRange launchShape(THREADS * groupCount, 1);

    const bool isPhilox   = (type == AF_RANDOM_ENGINE_PHILOX_4X32_10);
    const bool isThreefry = (type == AF_RANDOM_ENGINE_THREEFRY_2X32_16);
    if (isPhilox || isThreefry) {
        Kernel engineKernel =
            get_random_engine_kernel<T>(type, kerIdx, valuesPerGroup);
        KernelFunctor<cl::Buffer, uint, uint, uint, uint, uint> launchEngine(
            engineKernel);
        launchEngine(EnqueueArgs(getQueue(), launchShape, groupShape), out,
                     elements, counterHigh, counterLow, seedHigh, seedLow);
    }

    // The counter advances even when no kernel was launched for `type`.
    counter += elements;
    CL_DEBUG_FINISH(getQueue());
}
Esempio n. 2
0
/// Enqueues the Mersenne initialisation kernel on the active queue.
///
/// @param state device buffer passed as the kernel's first argument
/// @param table device buffer passed as the kernel's second argument
/// @param seed  64-bit seed passed as the kernel's third argument
void initMersenneState(cl::Buffer state, cl::Buffer table, const uintl &seed) {
    // MAX_BLOCKS work-groups of THREADS_PER_GROUP work-items each.
    NDRange groupShape(THREADS_PER_GROUP, 1);
    NDRange launchShape(groupShape[0] * MAX_BLOCKS, 1);

    Kernel initKernel = get_mersenne_init_kernel();
    KernelFunctor<cl::Buffer, cl::Buffer, uintl> launchInit(initKernel);
    launchInit(EnqueueArgs(getQueue(), launchShape, groupShape), state, table,
               seed);
    CL_DEBUG_FINISH(getQueue());
}
/// Runs one "diffUpdate" kernel pass over `inout`, building and caching the
/// kernel for the active device on first use.
///
/// @param inout      buffer and layout the kernel reads and updates in place
/// @param dt         forwarded to the kernel
/// @param mct        forwarded to the kernel
/// @param fluxFnCode forwarded to the kernel
void anisotropicDiffusion(Param inout, const float dt, const float mct,
                          const int fluxFnCode) {
    using cl::Buffer;
    using cl::EnqueueArgs;
    using cl::Kernel;
    using cl::KernelFunctor;
    using cl::NDRange;
    using cl::Program;

    // One cache entry per element type and per isMCDE value.
    std::string cacheKey("anisotropic_diffusion_");
    cacheKey += std::string(dtype_traits<T>::getName());
    cacheKey += "_";
    cacheKey += std::to_string(isMCDE);

    const int deviceId = getActiveDeviceId();
    kc_entry_t cached  = kernelCache(deviceId, cacheKey);

    const bool notBuiltYet = (cached.prog == nullptr && cached.ker == nullptr);
    if (notBuiltYet) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D SHRD_MEM_HEIGHT=" << (THREADS_X + 2);
        buildOpts << " -D SHRD_MEM_WIDTH=" << (THREADS_Y + 2);
        buildOpts << " -D IS_MCDE=" << isMCDE;
        if (std::is_same<T, double>::value) { buildOpts << " -D USE_DOUBLE"; }

        const char *sources[]   = {anisotropic_diffusion_cl};
        const int sourceLens[]  = {anisotropic_diffusion_cl_len};
        Program compiled;
        buildProgram(compiled, 1, sources, sourceLens, buildOpts.str());

        // The cache keeps these heap objects; they are not released here.
        cached.prog = new Program(compiled);
        cached.ker  = new Kernel(*cached.prog, "diffUpdate");
        addKernelToCache(deviceId, cacheKey, cached);
    }

    KernelFunctor<Buffer, KParam, float, float, int, unsigned, unsigned>
        launchDiffUpdate(*cached.ker);

    NDRange groupShape(THREADS_X, THREADS_Y, 1);

    const int tilesX = divup(inout.info.dims[0], groupShape[0]);
    const int tilesY = divup(inout.info.dims[1], groupShape[1]);

    // dims[2] and dims[3] are folded into the first and second launch
    // dimensions respectively.
    NDRange launchShape(groupShape[0] * tilesX * inout.info.dims[2],
                        groupShape[1] * tilesY * inout.info.dims[3], 1);

    launchDiffUpdate(EnqueueArgs(getQueue(), launchShape, groupShape),
                     *inout.data, inout.info, dt, mct, fluxFnCode, tilesX,
                     tilesY);

    CL_DEBUG_FINISH(getQueue());
}
Esempio n. 4
0
        /// Enqueues the Mersenne (AF_RANDOM_ENGINE_MERSENNE_GP11213) random
        /// engine kernel that writes `elements` values into `out`.
        ///
        /// @param out             device buffer receiving the generated values
        /// @param elements        number of values requested; 0 is a no-op
        /// @param state           forwarded to the kernel
        /// @param pos             forwarded to the kernel
        /// @param sh1             forwarded to the kernel
        /// @param sh2             forwarded to the kernel
        /// @param mask            forwarded to the kernel
        /// @param recursion_table forwarded to the kernel
        /// @param temper_table    forwarded to the kernel
        /// @param kerIdx          forwarded to get_random_engine_kernel
        void randomDistribution(cl::Buffer out, const size_t elements,
                cl::Buffer state, cl::Buffer pos, cl::Buffer sh1, cl::Buffer sh2,
                const uint mask, cl::Buffer recursion_table, cl::Buffer temper_table,
                int kerIdx)
        {
            // Nothing to generate. Without this guard `blocks` below would be 0
            // and divup(elements, blocks) would divide by zero (assuming the
            // usual (a + b - 1) / b definition of divup).
            if (elements == 0) {
                return;
            }

            int threads = THREADS;
            int min_elements_per_block = 32*THREADS*4*sizeof(uint)/sizeof(T);
            int blocks = divup(elements, min_elements_per_block);
            // Never launch more than MAX_BLOCKS work-groups; the per-block
            // share grows instead.
            blocks = (blocks > MAX_BLOCKS)? MAX_BLOCKS : blocks;
            int elementsPerBlock = divup(elements, blocks);

            NDRange local(threads, 1);
            NDRange global(threads * blocks, 1);
            Kernel ker = get_random_engine_kernel<T>(AF_RANDOM_ENGINE_MERSENNE_GP11213, kerIdx, elementsPerBlock);
            auto randomEngineOp = KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
                  uint, cl::Buffer, cl::Buffer, uint, uint>(ker);
            randomEngineOp(EnqueueArgs(getQueue(), global, local),
                    out, state, pos, sh1, sh2, mask, recursion_table, temper_table, elementsPerBlock, elements);
            CL_DEBUG_FINISH(getQueue());
        }
Esempio n. 5
0
/// Runs one separable-convolution pass ("convolve" kernel) of `signal` with
/// `filter`, writing to `out`. The kernel is compiled once per device and per
/// (conv_dim, T, accType, expand, filter length) combination and then cached.
///
/// @param out    destination buffer and layout
/// @param signal source buffer and layout
/// @param filter filter coefficients; dims[0] * dims[1] values are copied
///               from the start of its buffer
/// @throws rethrows after CL_TO_AF_ERROR has processed any cl::Error
void convSep(Param out, const Param signal, const Param filter)
{
    try {

        const int fLen = filter.info.dims[0] * filter.info.dims[1];

        std::string ref_name =
            std::string("convsep_") +
            std::to_string(conv_dim) +
            std::string("_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::string(dtype_traits<accType>::getName()) +
            std::string("_") +
            std::to_string(expand) +
            std::string("_") +
            std::to_string(fLen);

        int device = getActiveDeviceId();
        kc_t::iterator idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (idx == kernelCaches[device].end()) {
            // Local-memory tile size: the work-group extent along the
            // convolved dimension is padded by (fLen - 1) on both sides.
            const size_t C0_SIZE  = (THREADS_X+2*(fLen-1))* THREADS_Y;
            const size_t C1_SIZE  = (THREADS_Y+2*(fLen-1))* THREADS_X;

            size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType="<< dtype_traits<accType>::getName()
                    << " -D CONV_DIM="<< conv_dim
                    << " -D EXPAND="<< expand
                    << " -D FLEN="<< fLen
                    << " -D LOCAL_MEM_SIZE="<<locSize;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            Program prog;
            buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());

            // Owned by the cache from here on; not released in this function.
            entry.prog   = new Program(prog);
            entry.ker  = new Kernel(*entry.prog, "convolve");
            kernelCaches[device][ref_name] = entry;
        } else {
            entry = idx->second;
        }

        auto convOp = KernelFunctor<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*entry.ker);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        // dims[2] and dims[3] of the signal are folded into the first and
        // second launch dimensions respectively.
        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen*sizeof(accType));
        try {
            // FIX ME: if the filter array is strided, direct might cause issues
            getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen*sizeof(accType));

            convOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);
        } catch (...) {
            // Do not leak the temporary filter buffer when the copy or the
            // kernel launch fails; the outer handler still sees the exception.
            bufferFree(mBuff);
            throw;
        }

        bufferFree(mBuff);
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Esempio n. 6
0
/// Detects Harris corners in `in` and returns their positions and responses
/// in device buffers.
///
/// Steps: image gradients, their second-order products, smoothing of those
/// products with a window filter, per-pixel Harris response, non-maximal
/// suppression, and (when `max_corners` > 0 and more corners were found)
/// a sort by response followed by keeping the first `max_corners`.
///
/// @param corners_out  receives the number of corners returned
/// @param x_out        on return with *corners_out > 0: 1-D layout and device
///                     buffer holding the corner x coordinates
/// @param y_out        same as x_out, for the y coordinates
/// @param resp_out     same as x_out, for the responses
/// @param in           input image
/// @param max_corners  when > 0, upper bound on returned corners; in that
///                     case the suppression threshold is 0 and
///                     `min_response` is ignored
/// @param min_response suppression threshold used when `max_corners` == 0
/// @param sigma        < 0.5 selects a uniform window, otherwise a Gaussian
///                     window built with gaussian1D
/// @param filter_len   number of taps of the window filter
/// @param k_thr        forwarded to the Harris response kernel
///
/// When no corner is found, *corners_out is 0 and the three output Params are
/// left untouched.
void
harris(unsigned* corners_out,
            Param &x_out,
            Param &y_out,
            Param &resp_out,
            Param in,
            const unsigned max_corners,
            const float min_response,
            const float sigma,
            const unsigned filter_len,
            const float k_thr)
{
    auto kernels = getHarrisKernels<T>();
    using cl::Buffer;
    using cl::EnqueueArgs;
    using cl::NDRange;


    // Window filter (host side); released further down once the device no
    // longer needs it.
    convAccT* h_filter = new convAccT[filter_len];
    // Decide between rectangular or circular filter
    if (sigma < 0.5f) {
        for (unsigned i = 0; i < filter_len; i++)
            h_filter[i] = (T)1.f / (filter_len);
    } else {
        gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
    }

    const unsigned border_len = filter_len / 2 + 1;

    // Copy filter to device object
    Array<convAccT> filter = createHostDataArray<convAccT>(filter_len, h_filter);
    Array<T> ix = createEmptyArray<T>(dim4(4, in.info.dims));
    Array<T> iy = createEmptyArray<T>(dim4(4, in.info.dims));

    // Compute first-order derivatives as gradients
    gradient<T>(iy, ix, in);

    Array<T> ixx = createEmptyArray<T>(dim4(4, in.info.dims));
    Array<T> ixy = createEmptyArray<T>(dim4(4, in.info.dims));
    Array<T> iyy = createEmptyArray<T>(dim4(4, in.info.dims));

    // Second order-derivatives kernel sizes
    const unsigned blk_x_so = divup(in.info.dims[3] * in.info.strides[3], HARRIS_THREADS_PER_GROUP);
    const NDRange local_so(HARRIS_THREADS_PER_GROUP, 1);
    const NDRange global_so(blk_x_so * HARRIS_THREADS_PER_GROUP, 1);

    auto soOp = KernelFunctor< Buffer, Buffer, Buffer,
                               unsigned, Buffer, Buffer > (*std::get<0>(kernels));

    // Compute second-order derivatives
    soOp(EnqueueArgs(getQueue(), global_so, local_so),
         *ixx.get(), *ixy.get(), *iyy.get(),
         in.info.dims[3] * in.info.strides[3], *ix.get(), *iy.get());
    CL_DEBUG_FINISH(getQueue());

    // Convolve second order derivatives with proper window filter
    conv_helper<T, convAccT>(ixx, ixy, iyy, filter);

    cl::Buffer *d_responses = bufferAlloc(in.info.dims[3] * in.info.strides[3] * sizeof(T));

    // Harris responses kernel sizes
    unsigned blk_x_hr = divup(in.info.dims[0] - border_len*2, HARRIS_THREADS_X);
    unsigned blk_y_hr = divup(in.info.dims[1] - border_len*2, HARRIS_THREADS_Y);
    const NDRange local_hr(HARRIS_THREADS_X, HARRIS_THREADS_Y);
    const NDRange global_hr(blk_x_hr * HARRIS_THREADS_X, blk_y_hr * HARRIS_THREADS_Y);

    auto hrOp = KernelFunctor< Buffer, unsigned, unsigned, Buffer, Buffer, Buffer,
                               float, unsigned> (*std::get<2>(kernels));

    // Calculate Harris responses for all pixels
    hrOp(EnqueueArgs(getQueue(), global_hr, local_hr),
         *d_responses, in.info.dims[0], in.info.dims[1],
         *ixx.get(), *ixy.get(), *iyy.get(), k_thr, border_len);
    CL_DEBUG_FINISH(getQueue());

    // Number of corners is not known a priori, limit maximum number of corners
    // according to image dimensions
    unsigned corner_lim = in.info.dims[3] * in.info.strides[3] * 0.2f;

    // Device-side counter, zeroed here and read back after suppression.
    unsigned corners_found = 0;
    cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned));
    getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);

    cl::Buffer *d_x_corners = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer *d_y_corners = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer *d_resp_corners = bufferAlloc(corner_lim * sizeof(float));

    // When a corner cap is requested, keep every local maximum and let the
    // sort below pick the strongest ones.
    const float min_r = (max_corners > 0) ? 0.f : min_response;

    auto nmOp = KernelFunctor< Buffer, Buffer, Buffer, Buffer, Buffer, unsigned, unsigned,
                            float, unsigned, unsigned> (*std::get<3>(kernels));

    // Perform non-maximal suppression
    nmOp(EnqueueArgs(getQueue(), global_hr, local_hr),
         *d_x_corners, *d_y_corners, *d_resp_corners, *d_corners_found,
         *d_responses, in.info.dims[0], in.info.dims[1],
         min_r, border_len, corner_lim);
    CL_DEBUG_FINISH(getQueue());

    getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);

    // The blocking read above has returned, so (assuming the in-order queue
    // the rest of this function relies on) every earlier command, including
    // any upload of the host filter, has completed. Release the host copy,
    // which was previously leaked.
    delete[] h_filter;

    bufferFree(d_responses);
    bufferFree(d_corners_found);

    *corners_out = min(corners_found, (max_corners > 0) ? max_corners : corner_lim);
    if (*corners_out == 0) {
        // No corner is handed back to the caller, so the candidate buffers
        // must be released here instead of being leaked.
        bufferFree(d_x_corners);
        bufferFree(d_y_corners);
        bufferFree(d_resp_corners);
        return;
    }

    // Set output Param info
    x_out.info.dims[0] = y_out.info.dims[0] = resp_out.info.dims[0] = *corners_out;
    x_out.info.strides[0] = y_out.info.strides[0] = resp_out.info.strides[0] = 1;
    x_out.info.offset = y_out.info.offset = resp_out.info.offset = 0;
    for (int k = 1; k < 4; k++) {
        x_out.info.dims[k] = y_out.info.dims[k] = resp_out.info.dims[k] =  1;
        x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
        y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
        resp_out.info.strides[k] = resp_out.info.dims[k - 1] * resp_out.info.strides[k - 1];
    }

    if (max_corners > 0 && corners_found > *corners_out) {
        // More corners than requested: sort by response and keep the first
        // *corners_out entries.
        Param harris_resp;
        Param harris_idx;

        harris_resp.info.dims[0] = harris_idx.info.dims[0] = corners_found;
        harris_resp.info.strides[0] = harris_idx.info.strides[0] = 1;
        // Param's default initialisation is not visible here, so set the
        // offset explicitly, as is done for the output Params above.
        harris_resp.info.offset = harris_idx.info.offset = 0;

        for (int k = 1; k < 4; k++) {
            harris_resp.info.dims[k] = 1;
            harris_resp.info.strides[k] = harris_resp.info.dims[k - 1] * harris_resp.info.strides[k - 1];
            harris_idx.info.dims[k] = 1;
            harris_idx.info.strides[k] = harris_idx.info.dims[k - 1] * harris_idx.info.strides[k - 1];
        }

        int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3];
        harris_resp.data = d_resp_corners;
        // Create indices using range
        harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
        kernel::range<uint>(harris_idx, 0);

        // Sort Harris responses
        // NOTE(review): `false` is presumably "descending" - confirm against
        // sort0ByKey's signature.
        kernel::sort0ByKey<float, uint>(harris_resp, harris_idx, false);

        x_out.data = bufferAlloc(*corners_out * sizeof(float));
        y_out.data = bufferAlloc(*corners_out * sizeof(float));
        resp_out.data = bufferAlloc(*corners_out * sizeof(float));

        // Keep corners kernel sizes
        const unsigned blk_x_kc = divup(*corners_out, HARRIS_THREADS_PER_GROUP);
        const NDRange local_kc(HARRIS_THREADS_PER_GROUP, 1);
        const NDRange global_kc(blk_x_kc * HARRIS_THREADS_PER_GROUP, 1);

        auto kcOp = KernelFunctor< Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
                                   unsigned> (*std::get<1>(kernels));

        // Keep only the first corners_to_keep corners with higher Harris
        // responses
        kcOp(EnqueueArgs(getQueue(), global_kc, local_kc),
             *x_out.data, *y_out.data, *resp_out.data,
             *d_x_corners, *d_y_corners, *harris_resp.data, *harris_idx.data,
             *corners_out);
        CL_DEBUG_FINISH(getQueue());

        // harris_resp.data aliases d_resp_corners, so this frees all three
        // candidate buffers plus the index buffer.
        bufferFree(d_x_corners);
        bufferFree(d_y_corners);
        bufferFree(harris_resp.data);
        bufferFree(harris_idx.data);
    }
    else if (max_corners == 0 && corners_found < corner_lim) {
        // Fewer corners than the candidate buffers can hold: copy them into
        // exactly-sized buffers and release the oversized ones.
        x_out.data = bufferAlloc(*corners_out * sizeof(float));
        y_out.data = bufferAlloc(*corners_out * sizeof(float));
        resp_out.data = bufferAlloc(*corners_out * sizeof(float));
        getQueue().enqueueCopyBuffer(*d_x_corners, *x_out.data, 0, 0, *corners_out * sizeof(float));
        getQueue().enqueueCopyBuffer(*d_y_corners, *y_out.data, 0, 0, *corners_out * sizeof(float));
        getQueue().enqueueCopyBuffer(*d_resp_corners, *resp_out.data, 0, 0, *corners_out * sizeof(float));

        bufferFree(d_x_corners);
        bufferFree(d_y_corners);
        bufferFree(d_resp_corners);
    }
    else {
        // Hand the candidate buffers to the caller as they are.
        x_out.data = d_x_corners;
        y_out.data = d_y_corners;
        resp_out.data = d_resp_corners;
    }
}
Esempio n. 7
0
// Labels the connected components of `in`, writing the labels to `out`.
//
// Visible steps: build the three kernels once per device, run "initial_label",
// iterate "update_equiv" until the device reports no further change, then
// derive a label lookup table with Boost.Compute and apply it with
// "final_relabel".
//
// out: destination buffer and layout for the labels.
// in:  source buffer and layout.
// Any cl::Error is passed to CL_TO_AF_ERROR and then rethrown.
void regions(Param out, Param in)
{
    try {
        // Per-device, one-time compilation state shared by all calls.
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> regionsProgs;
        static std::map<int, Kernel *>     ilKernel;
        static std::map<int, Kernel *>     frKernel;
        static std::map<int, Kernel *>     ueKernel;

        int device = getActiveDeviceId();

        static const int block_dim = 16;
        static const int num_warps = 8;

        // Build the program and its three kernels the first time this device
        // is used.
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                // The two branches differ only by the trailing FULL_CONN
                // define.
                if (full_conn) {
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D BLOCK_DIM=" << block_dim
                            << " -D NUM_WARPS=" << num_warps
                            << " -D N_PER_THREAD=" << n_per_thread
                            << " -D LIMIT_MAX=" << limit_max<T>()
                            << " -D FULL_CONN";
                }
                else {
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D BLOCK_DIM=" << block_dim
                            << " -D NUM_WARPS=" << num_warps
                            << " -D N_PER_THREAD=" << n_per_thread
                            << " -D LIMIT_MAX=" << limit_max<T>();
                }
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                Program prog;
                buildProgram(prog, regions_cl, regions_cl_len, options.str());
                regionsProgs[device] = new Program(prog);

                ilKernel[device] = new Kernel(*regionsProgs[device], "initial_label");
                frKernel[device] = new Kernel(*regionsProgs[device], "final_relabel");
                ueKernel[device] = new Kernel(*regionsProgs[device], "update_equiv");
            });

        const NDRange local(THREADS_X, THREADS_Y);

        // Blocks are sized for a tile of 2*THREADS_X by 2*THREADS_Y elements
        // per work-group.
        const int blk_x = divup(in.info.dims[0], THREADS_X*2);
        const int blk_y = divup(in.info.dims[1], THREADS_Y*2);

        const NDRange global(blk_x * THREADS_X, blk_y * THREADS_Y);

        auto ilOp = make_kernel<Buffer, KParam,
                                Buffer, KParam> (*ilKernel[device]);

        ilOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info, *in.data, in.info);

        CL_DEBUG_FINISH(getQueue());

        // Host/device flag pair: the host clears the flag, runs one update
        // pass, then reads the flag back; the loop ends when it stays 0.
        int h_continue = 1;
        cl::Buffer *d_continue = bufferAlloc(sizeof(int));

        while (h_continue) {
            h_continue = 0;
            getQueue().enqueueWriteBuffer(*d_continue, CL_TRUE, 0, sizeof(int), &h_continue);

            auto ueOp = make_kernel<Buffer, KParam,
                                    Buffer> (*ueKernel[device]);

            ueOp(EnqueueArgs(getQueue(), global, local),
                 *out.data, out.info, *d_continue);
            CL_DEBUG_FINISH(getQueue());

            getQueue().enqueueReadBuffer(*d_continue, CL_TRUE, 0, sizeof(int), &h_continue);
        }

        bufferFree(d_continue);

        // Now, perform the final relabeling.  This converts the equivalency
        // map from having unique labels based on the lowest pixel in the
        // component to being sequentially numbered components starting at
        // 1.
        // Only dims[0] * dims[1] elements are considered from here on.
        int size = in.info.dims[0] * in.info.dims[1];

        compute::command_queue c_queue(getQueue()());

        // Wrap raw device ptr
        compute::context context(getContext()());
        compute::vector<T> tmp(size, context);
        clEnqueueCopyBuffer(getQueue()(), (*out.data)(), tmp.get_buffer().get(), 0, 0, size * sizeof(T), 0, NULL, NULL);

        // Sort the copy
        compute::sort(tmp.begin(), tmp.end(), c_queue);

        // Take the max element, this is the number of label assignments to
        // compute.
        //int num_bins = tmp[size - 1] + 1;
        // Blocking read of the last (largest) element of the sorted copy.
        T last_label;
        clEnqueueReadBuffer(getQueue()(), tmp.get_buffer().get(), CL_TRUE, (size - 1) * sizeof(T), sizeof(T), &last_label, 0, NULL, NULL);
        int num_bins = (int)last_label + 1;

        Buffer labels(getContext(), CL_MEM_READ_WRITE, num_bins * sizeof(T));
        compute::buffer c_labels(labels());
        compute::buffer_iterator<T> labels_begin = compute::make_buffer_iterator<T>(c_labels, 0);
        compute::buffer_iterator<T> labels_end   = compute::make_buffer_iterator<T>(c_labels, num_bins);

        // Find the end of each section of values
        compute::counting_iterator<T> search_begin(0);

        // Device closure: binary search over the sorted `tmp` that returns
        // the index of the first element greater than `v`.
        int tmp_size = size;
        BOOST_COMPUTE_CLOSURE(int, upper_bound_closure, (int v), (tmp, tmp_size),
        {
            int start = 0, n = tmp_size, i;
            while(start < n)
            {
                i = (start + n) / 2;
                if(v < tmp[i])
                {
                    n = i;
                }
                else
                {
                    start = i + 1;
                }
            }

            return start;
        });

        // Device function: maps every value >= 1 to 1 and leaves smaller
        // values unchanged.
        BOOST_COMPUTE_FUNCTION(int, clamp_to_one, (int i),
        {
            return (i >= 1) ? 1 : i;
        });
Esempio n. 8
0
        // NOTE(review): `search_begin` and `upper_bound_closure` are defined
        // above but never used in the visible code, and `labels` is not
        // written before the transform below reads it. A step that fills
        // `labels` appears to be missing at this point - confirm against the
        // upstream source.

        // Perform the scan -- this computes the correct labels for each
        // component
        compute::transform(labels_begin, labels_end,
                           labels_begin,
                           clamp_to_one,
                           c_queue);
        compute::exclusive_scan(labels_begin,
                                labels_end,
                                labels_begin,
                                c_queue);

        // Apply the correct labels to the equivalency map
        auto frOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                Buffer> (*frKernel[device]);

        //Buffer labels_buf(tmp.get_buffer().get());
        frOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info, *in.data, in.info, labels);
        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}

} //namespace kernel

} //namespace opencl