void vkeGameRendererDynamic::setNodeData(VkeNodeData::List *inData){
	m_node_data = inData;
	if (m_node_data != NULL){

		uint32_t cnt = m_node_data->count();  // node count (unused: this variant hard-codes its capacity)
		uint32_t transformsSize = 64 * 64;    // fixed-size transform block

		uint32_t sz = sizeof(VkeNodeUniform) * 100;  // hard-coded room for 100 node uniforms
		sz += transformsSize;

		VkBufferUsageFlags usageFlags = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
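
		// Paired buffers: a host-visible staging buffer for CPU writes and a
		// device-local buffer meant to receive the staged data for rendering.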

		bufferCreate(&m_uniforms_buffer_staging, sz, (VkBufferUsageFlagBits)usageFlags);
		bufferAlloc(&m_uniforms_buffer_staging, &m_uniforms_staging, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);

		bufferCreate(&m_uniforms_buffer, sz, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
		bufferAlloc(&m_uniforms_buffer, &m_uniforms_memory, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);


		m_uniforms_descriptor.buffer = m_uniforms_buffer;
		m_uniforms_descriptor.offset = 0;
		m_uniforms_descriptor.range = sizeof(VkeNodeUniform) * 100;

		m_transforms_descriptor.buffer = m_uniforms_buffer;
		m_transforms_descriptor.offset = sizeof(VkeNodeUniform) * 100;
		m_transforms_descriptor.range = transformsSize;
	}
}
Example no. 2
void mean_first(Param out, Param in, Param inWeight)
{
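    // Two-pass weighted mean along dim 0: the first launch reduces each row
    // in blocks of threads_x * REPEAT elements; a second launch below folds
    // the per-block partial results when more than one block is needed.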
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    Param tmpOut = out;
    Param noWeight;
    noWeight.info.offset = 0;
    for (int k = 0; k < 4; ++k) {
        noWeight.info.dims[k] = 0;
        noWeight.info.strides[k] = 0;
    }
    // The value does not matter since it will not be read; it just needs to be a valid buffer.
    noWeight.data = inWeight.data;

    Param tmpWeight = noWeight;

    if (groups_x > 1) {
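        // More than one block per row: allocate intermediate buffers to hold
        // one partial mean (and one partial weight) per block along dim 0.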

        tmpOut.data = bufferAlloc(groups_x *
                in.info.dims[1] *
                in.info.dims[2] *
                in.info.dims[3] *
                sizeof(To));

        tmpWeight.data = bufferAlloc(groups_x *
                in.info.dims[1] *
                in.info.dims[2] *
                in.info.dims[3] *
                sizeof(Tw));


        tmpOut.info.dims[0] = groups_x;
        for (int k = 1; k < 4; k++) tmpOut.info.strides[k] *= groups_x;
        tmpWeight.info = tmpOut.info;
    }

    mean_first_launcher<Ti, Tw, To>(tmpOut, tmpWeight, in, inWeight, threads_x, groups_x, groups_y);

    if (groups_x > 1) {
        // No weight is needed when writing out the final output.
        mean_first_launcher<Ti, Tw, To>(out, noWeight, tmpOut, tmpWeight, threads_x, 1, groups_y);

        bufferFree(tmpOut.data);
        bufferFree(tmpWeight.data);
    }
}
void vkeGameRendererDynamic::setNodeData(VkeNodeData::List *inData){
	m_node_data = inData;
	if (m_node_data != NULL){

		uint32_t cnt = m_node_data->count();
		uint32_t transformsSize = 64 * m_instance_count;  // 64 bytes (one 4x4 float matrix) per instance

		uint32_t sz = sizeof(VkeNodeUniform) * cnt;
		sz += transformsSize;

		m_uniforms_local = (float*)malloc(sz);  // host-side shadow copy of the uniform data

		bufferCreate(&m_uniforms_buffer, sz, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
		bufferAlloc(&m_uniforms_buffer, &m_uniforms_memory, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);


		m_uniforms_descriptor.buffer = m_uniforms_buffer;
		m_uniforms_descriptor.offset = 0;
		m_uniforms_descriptor.range = sizeof(VkeNodeUniform) * cnt;

		m_transforms_descriptor.buffer = m_uniforms_buffer;
		m_transforms_descriptor.offset = sizeof(VkeNodeUniform) * cnt;
		m_transforms_descriptor.range = transformsSize; 
	}
}
void vkeGameRendererDynamic::setMaterialData(VkeMaterial::List *inData){
	m_materials = inData;

	if (m_materials != NULL){

		uint32_t cnt = m_materials->count();
		uint32_t sz = sizeof(VkeMaterialUniform) * cnt;

		VkBufferUsageFlags usageFlags = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;

		bufferCreate(&m_material_buffer_staging, sz, (VkBufferUsageFlagBits)usageFlags);
		bufferAlloc(&m_material_buffer_staging, &m_material_staging, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
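
		// Map the staging buffer once and let each material write its
		// uniform data into it.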

		VkeMaterialUniform *uniforms = NULL;

		VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(), m_material_staging, 0, sz, 0, (void **)&uniforms), "Could not map buffer memory.\n");

		for (uint32_t i = 0; i < cnt; ++i){
			VkeMaterial *mat = m_materials->getMaterial(i);
			mat->initVKBufferData(m_material_buffer_staging);
			mat->updateVKBufferData(uniforms);
		}

		vkUnmapMemory(getDefaultDevice(), m_material_staging);
	}

}
Example no. 5
    static void scan_dim(Param &out, const Param &in, int dim)
    {
        uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
        uint threads_x = THREADS_X;

        uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                              (uint)out.info.dims[1],
                              (uint)out.info.dims[2],
                              (uint)out.info.dims[3]};

        groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);
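        // A single group along the dim means one launch completes the scan;
        // otherwise scan per block, scan the per-block sums in place, then
        // broadcast them back (three launches in total).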

        if (groups_all[dim] == 1) {

            scan_dim_launcher<Ti, To, op, inclusive_scan>(out, out, in,
                                          dim, true,
                                          threads_y,
                                          groups_all);
        } else {

            Param tmp = out;

            tmp.info.dims[dim] = groups_all[dim];
            tmp.info.strides[0] = 1;
            for (int k = 1; k < 4; k++) {
                tmp.info.strides[k] = tmp.info.strides[k - 1] * tmp.info.dims[k - 1];
            }

            int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
            // Temporary buffer for the per-block scan results; freed below
            // once the sums have been broadcast back into out.
            tmp.data = bufferAlloc(tmp_elements * sizeof(To));

            scan_dim_launcher<Ti, To, op, inclusive_scan>(out, tmp, in,
                                          dim, false,
                                          threads_y,
                                          groups_all);

            int gdim = groups_all[dim];
            groups_all[dim] = 1;

            if (op == af_notzero_t) {
                scan_dim_launcher<To, To, af_add_t, true>(tmp, tmp, tmp,
                                                    dim, true,
                                                    threads_y,
                                                    groups_all);
            } else {
                scan_dim_launcher<To, To,       op, true>(tmp, tmp, tmp,
                                                    dim, true,
                                                    threads_y,
                                                    groups_all);
            }

            groups_all[dim] = gdim;
            bcast_dim_launcher<To, To, op, inclusive_scan>(out, tmp,
                                            dim, true,
                                            threads_y,
                                            groups_all);
            bufferFree(tmp.data);
        }
    }
Example no. 6
void conv2(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt)
{
    size_t se_size = filt.info.dims[0] * filt.info.dims[1] * sizeof(aT);
    p.impulse = bufferAlloc(se_size);
    int f0Off = filt.info.offset;
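
    // Convolve once per 2D filter slice: copy the slice into the impulse
    // buffer, then launch the helper for the matching signal batch.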

    for (int b3=0; b3<filt.info.dims[3]; ++b3) {
        int f3Off = b3 * filt.info.strides[3];

        for (int b2=0; b2<filt.info.dims[2]; ++b2) {
            int f2Off = b2 * filt.info.strides[2];

            // FIXME: if the filter array is strided, direct copy of symbols
            // might cause issues
            getQueue().enqueueCopyBuffer(*filt.data, *p.impulse,
                                         (f2Off+f3Off+f0Off)*sizeof(aT),
                                         0, se_size);

            p.o[1] = (p.outHasNoOffset ? 0 : b2);
            p.o[2] = (p.outHasNoOffset ? 0 : b3);
            p.s[1] = (p.inHasNoOffset ? 0 : b2);
            p.s[2] = (p.inHasNoOffset ? 0 : b3);

            conv2Helper<T, aT, expand>(p, out, sig, filt);
        }
    }
}
void convolve2(Param out, const Param signal, const Param filter)
{
    try {
        static std::once_flag  compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>   convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();
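
        // Compile the separable convolution kernel once per device and cache
        // the program and kernel objects for subsequent calls.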

        std::call_once( compileFlags[device], [device] () {
                const size_t C0_SIZE  = (THREADS_X+2*(fLen-1))* THREADS_Y;
                const size_t C1_SIZE  = (THREADS_Y+2*(fLen-1))* THREADS_X;

                size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D accType=" << dtype_traits<accType>::getName()
                        << " -D CONV_DIM=" << conv_dim
                        << " -D EXPAND=" << expand
                        << " -D FLEN=" << fLen
                        << " -D LOCAL_MEM_SIZE=" << locSize;
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                Program prog;
                buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
                convProgs[device]   = new Program(prog);
                convKernels[device] = new Kernel(*convProgs[device], "convolve");
                });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen*sizeof(accType));
        // FIXME: if the filter array is strided, a direct copy might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen*sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        bufferFree(mBuff);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example no. 8
Array<T> index(const Array<T>& in, const af_index_t idxrs[])
{
    kernel::IndexKernelParam_t p;
    std::vector<af_seq> seqs(4, af_span);
    // create seq vector to retrieve output
    // dimensions, offsets & strides
    for (dim_t x=0; x<4; ++x) {
        if (idxrs[x].isSeq) {
            seqs[x] = idxrs[x].idx.seq;
        }
    }

    // retrieve dimensions, strides and offsets
    dim4 iDims = in.dims();
    dim4 dDims = in.getDataDims();
    dim4 oDims = toDims  (seqs, iDims);
    dim4 iOffs = toOffset(seqs, dDims);
    dim4 iStrds= toStride(seqs, dDims);

    for (dim_t i=0; i<4; ++i) {
        p.isSeq[i] = idxrs[i].isSeq;
        p.offs[i]  = iOffs[i];
        p.strds[i] = iStrds[i];
    }
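
    // One buffer pointer per dimension: the user's index array where one was
    // given, otherwise a 1-element placeholder allocated below.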

    Buffer* bPtrs[4];

    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
    // loop through the indexers to read af_array indexes
    for (dim_t x=0; x<4; ++x) {
        // set index pointers where applicable
        if (!p.isSeq[x]) {
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            bPtrs[x] = idxArrs[x].get();
            // set the output array's i-th dimension value
            oDims[x] = idxArrs[x].elements();
        }
        else {
            // allocate a 1-element buffer to keep OpenCL from failing
            bPtrs[x] = bufferAlloc(sizeof(uint));
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);
    if(oDims.elements() == 0) { return out; }

    kernel::index<T>(out, in, p, bPtrs);

    for (dim_t x=0; x<4; ++x) {
        if (p.isSeq[x]) bufferFree(bPtrs[x]);
    }

    return out;
}
Example no. 9
void morph3d(Param       out,
        const Param      in,
        const Param      mask)
{
    std::string refName = std::string("morph3d_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(isDilation) + std::to_string(SeLength);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);
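
    // Build and cache the morphology kernel on first use for this
    // type / SeLength / dilation-vs-erosion combination.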

    if (entry.prog==0 && entry.ker==0) {
        std::string options = generateOptionsString<T, isDilation, SeLength>();
        const char* ker_strs[] = {morph_cl};
        const int   ker_lens[] = {morph_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "morph3d");
        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer,
                                  cl::LocalSpaceArg, int >(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);
    // launch batch * blk_x blocks along x dimension
    NDRange global(blk_x * CUBE_X * in.info.dims[3], blk_y * CUBE_Y, blk_z * CUBE_Z);

    // copy mask/filter to constant memory
    cl_int se_size   = sizeof(T)*SeLength*SeLength*SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

    // calculate shared memory size
    const int padding = (SeLength%2==0 ? (SeLength-1) : (2*(SeLength/2)));
    const int locLen  = CUBE_X+padding+1;
    const int locArea = locLen *(CUBE_Y+padding);
    const int locSize = locArea*(CUBE_Z+padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize*sizeof(T)), blk_x);

    bufferFree(mBuff);
    CL_DEBUG_FINISH(getQueue());
}
Example no. 10
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
                    const unsigned idim0, const unsigned idim1,
                    const cl::Buffer* resp_in, const unsigned edge,
                    const unsigned max_corners) {
    unsigned corners_found = 0;

    std::string refName =
        std::string("non_maximal_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {susan_cl};
        const int ker_lens[]   = {susan_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "non_maximal");

        addKernelToCache(device, refName, entry);
    }
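
    // Device-side counter, zero-initialized; the kernel increments it for
    // every corner that survives non-maximal suppression.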

    cl::Buffer* d_corners_found = bufferAlloc(sizeof(unsigned));
    getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0,
                                  sizeof(unsigned), &corners_found);

    auto nonMaximalOp =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, unsigned, unsigned,
                      Buffer, unsigned, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    nonMaximalOp(EnqueueArgs(getQueue(), global, local), *x_out, *y_out,
                 *resp_out, *d_corners_found, idim0, idim1, *resp_in, edge,
                 max_corners);

    getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned),
                                 &corners_found);
    bufferFree(d_corners_found);

    return corners_found;
}
Example no. 11
        void csrmv(Param out,
                   const Param &values, const Param &rowIdx, const Param &colIdx,
                   const Param &rhs, const T alpha, const T beta)
        {
            bool use_alpha = (alpha != scalar<T>(1.0));
            bool use_beta = (beta != scalar<T>(0.0));

            // Using greedy indexing is causing performance issues on many platforms
            // FIXME: Figure out why
            bool use_greedy = false;

            // FIXME: Find a better number based on average non zeros per row
            int threads = 64;

            std::string ref_name =
                std::string("csrmv_") +
                std::string(dtype_traits<T>::getName()) +
                std::string("_") +
                std::to_string(use_alpha) +
                std::string("_") +
                std::to_string(use_beta) +
                std::string("_") +
                std::to_string(use_greedy) +
                std::string("_") +
                std::to_string(threads);

            int device = getActiveDeviceId();

            kc_entry_t entry = kernelCache(device, ref_name);

            if (entry.prog==0 && entry.ker==0) {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();
                options << " -D USE_ALPHA=" << use_alpha;
                options << " -D USE_BETA=" << use_beta;
                options << " -D USE_GREEDY=" << use_greedy;
                options << " -D THREADS=" << threads;

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                if (std::is_same<T, cfloat>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D IS_CPLX=1";
                } else {
                    options << " -D IS_CPLX=0";
                }

                const char *ker_strs[] = {csrmv_cl};
                const int   ker_lens[] = {csrmv_cl_len};

                Program prog;
                buildProgram(prog, 1, ker_strs, ker_lens, options.str());
                entry.prog = new Program(prog);
                entry.ker  = new Kernel[2];
                entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
                entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

                addKernelToCache(device, ref_name, entry);
            }
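
            // Zero-initialized device counter handed to the kernel;
            // presumably only consumed by the greedy-indexing path, which is
            // disabled above.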

            int count = 0;
            cl::Buffer *counter = bufferAlloc(sizeof(int));
            getQueue().enqueueWriteBuffer(*counter, CL_TRUE,
                                          0,
                                          sizeof(int),
                                          (void *)&count);

            // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
            bool is_csrmv_block = true;
            auto csrmv_kernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];
            auto csrmv_func = KernelFunctor<Buffer,
                                            Buffer, Buffer, Buffer,
                                            int,
                                            Buffer, KParam, T, T, Buffer>(csrmv_kernel);

            NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);
            int M = rowIdx.info.dims[0] - 1;

            int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
            groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);
            NDRange global(local[0] * groups_x, 1);

            csrmv_func(EnqueueArgs(getQueue(), global, local),
                        *out.data, *values.data, *rowIdx.data, *colIdx.data,
                        M, *rhs.data, rhs.info, alpha, beta, *counter);

            CL_DEBUG_FINISH(getQueue());
            bufferFree(counter);
        }
Example no. 12
void orb(unsigned* out_feat,
         Param& x_out,
         Param& y_out,
         Param& score_out,
         Param& ori_out,
         Param& size_out,
         Param& desc_out,
         Param image,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels,
         const bool blur_img)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program            orbProgs[DeviceManager::MAX_DEVICES];
        static Kernel             hrKernel[DeviceManager::MAX_DEVICES];
        static Kernel             kfKernel[DeviceManager::MAX_DEVICES];
        static Kernel             caKernel[DeviceManager::MAX_DEVICES];
        static Kernel             eoKernel[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D BLOCK_SIZE=" << ORB_THREADS_X;

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                buildProgram(orbProgs[device],
                             orb_cl,
                             orb_cl_len,
                             options.str());

                hrKernel[device] = Kernel(orbProgs[device], "harris_response");
                kfKernel[device] = Kernel(orbProgs[device], "keep_features");
                caKernel[device] = Kernel(orbProgs[device], "centroid_angle");
                eoKernel[device] = Kernel(orbProgs[device], "extract_orb");
            });

        unsigned patch_size = REF_PAT_SIZE;

        unsigned min_side = std::min(image.info.dims[0], image.info.dims[1]);
        unsigned max_levels = 0;
        float scl_sum = 0.f;
        for (unsigned i = 0; i < levels; i++) {
            min_side /= scl_fctr;

            // Minimum image side for a descriptor to be computed
            if (min_side < patch_size || max_levels == levels) break;

            max_levels++;
            scl_sum += 1.f / (float)pow(scl_fctr,(float)i);
        }
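
        // max_levels now holds the number of usable pyramid levels and
        // scl_sum the normalization used to split max_feat across them.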

        std::vector<cl::Buffer*> d_x_pyr(max_levels);
        std::vector<cl::Buffer*> d_y_pyr(max_levels);
        std::vector<cl::Buffer*> d_score_pyr(max_levels);
        std::vector<cl::Buffer*> d_ori_pyr(max_levels);
        std::vector<cl::Buffer*> d_size_pyr(max_levels);
        std::vector<cl::Buffer*> d_desc_pyr(max_levels);

        std::vector<unsigned> feat_pyr(max_levels);
        unsigned total_feat = 0;

        // Compute number of features to keep for each level
        std::vector<unsigned> lvl_best(max_levels);
        unsigned feat_sum = 0;
        for (unsigned i = 0; i + 1 < max_levels; i++) {  // i + 1 avoids unsigned underflow when max_levels == 0
            float lvl_scl = (float)pow(scl_fctr,(float)i);
            lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl);
            feat_sum += lvl_best[i];
        }
        if (max_levels > 0) lvl_best[max_levels - 1] = max_feat - feat_sum;

        // Maintain a reference to previous level image
        Param prev_img;
        Param lvl_img;

        const unsigned gauss_len = 9;
        T* h_gauss = nullptr;
        Param gauss_filter;
        gauss_filter.data = nullptr;

        for (unsigned i = 0; i < max_levels; i++) {
            const float lvl_scl = (float)pow(scl_fctr,(float)i);

            if (i == 0) {
                // First level is used in its original size
                lvl_img = image;

                prev_img = image;
            }
            else if (i > 0) {
                // Resize previous level image to current level dimensions
                lvl_img.info.dims[0] = round(image.info.dims[0] / lvl_scl);
                lvl_img.info.dims[1] = round(image.info.dims[1] / lvl_scl);

                lvl_img.info.strides[0] = 1;
                lvl_img.info.strides[1] = lvl_img.info.dims[0];

                for (int k = 2; k < 4; k++) {
                    lvl_img.info.dims[k] = 1;
                    lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] * lvl_img.info.strides[k - 1];
                }

                lvl_img.info.offset = 0;
                lvl_img.data = bufferAlloc(lvl_img.info.dims[3] * lvl_img.info.strides[3] * sizeof(T));

                resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img);

                if (i > 1)
                   bufferFree(prev_img.data);
                prev_img = lvl_img;
            }

            unsigned lvl_feat = 0;
            Param d_x_feat, d_y_feat, d_score_feat;

            // Round feature size to nearest odd integer
            float size = 2.f * floor(patch_size / 2.f) + 1.f;

            // Avoid keeping features that might be too wide to fit on the
            // image; sqrt(2.f) is the radius when the angle is 45 degrees,
            // which is the widest possible case
            unsigned edge = ceil(size * sqrt(2.f) / 2.f);

            // Detect FAST features
            fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat,
                             lvl_img, fast_thr, 0.15f, edge);

            if (lvl_feat == 0) {
                feat_pyr[i] = 0;

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }
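
            // FAST produced lvl_feat candidate corners; drop their raw scores
            // and re-rank the survivors by Harris response below.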

            bufferFree(d_score_feat.data);

            unsigned usable_feat = 0;
            cl::Buffer* d_usable_feat = bufferAlloc(sizeof(unsigned));
            getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            cl::Buffer* d_x_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_y_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float));

            // Calculate Harris responses
            // Good block_size >= 7 (must be an odd number)
            const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X);
            const NDRange local(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            unsigned block_size = 7;
            float k_thr = 0.04f;

            auto hrOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, const unsigned,
                                    Buffer, Buffer, KParam,
                                    const unsigned, const float, const unsigned> (hrKernel[device]);

            hrOp(EnqueueArgs(getQueue(), global, local),
                 *d_x_harris, *d_y_harris, *d_score_harris,
                 *d_x_feat.data, *d_y_feat.data, lvl_feat,
                 *d_usable_feat, *lvl_img.data, lvl_img.info,
                 block_size, k_thr, patch_size);
            CL_DEBUG_FINISH(getQueue());

            getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            bufferFree(d_x_feat.data);
            bufferFree(d_y_feat.data);
            bufferFree(d_usable_feat);

            if (usable_feat == 0) {
                feat_pyr[i] = 0;

                bufferFree(d_x_harris);
                bufferFree(d_y_harris);
                bufferFree(d_score_harris);

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }

            // Sort features according to Harris responses
            Param d_harris_sorted;
            Param d_harris_idx;

            d_harris_sorted.info.dims[0] = usable_feat;
            d_harris_idx.info.dims[0] = usable_feat;
            d_harris_sorted.info.strides[0] = 1;
            d_harris_idx.info.strides[0] = 1;

            for (int k = 1; k < 4; k++) {
                d_harris_sorted.info.dims[k] = 1;
                d_harris_idx.info.dims[k] = 1;
                d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] * d_harris_sorted.info.strides[k - 1];
                d_harris_idx.info.strides[k] = d_harris_idx.info.dims[k - 1] * d_harris_idx.info.strides[k - 1];
            }

            d_harris_sorted.info.offset = 0;
            d_harris_idx.info.offset = 0;
            d_harris_sorted.data = d_score_harris;
            d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));

            sort0_index<float, false>(d_harris_sorted, d_harris_idx);

            cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float));

            usable_feat = min(usable_feat, lvl_best[i]);

            // Keep only features with higher Harris responses
            const dim_type keep_blk = divup(usable_feat, ORB_THREADS);
            const NDRange local_keep(ORB_THREADS, 1);
            const NDRange global_keep(keep_blk * ORB_THREADS, 1);

            auto kfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer, Buffer,
                                    const unsigned> (kfKernel[device]);

            kfOp(EnqueueArgs(getQueue(), global_keep, local_keep),
                 *d_x_lvl, *d_y_lvl, *d_score_lvl,
                 *d_x_harris, *d_y_harris, *d_harris_sorted.data, *d_harris_idx.data,
                 usable_feat);
            CL_DEBUG_FINISH(getQueue());

            bufferFree(d_x_harris);
            bufferFree(d_y_harris);
            bufferFree(d_harris_sorted.data);
            bufferFree(d_harris_idx.data);

            cl::Buffer* d_ori_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float));

            // Compute orientation of features
            const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X);
            const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            auto caOp = make_kernel<Buffer, Buffer, Buffer,
                                    const unsigned, Buffer, KParam,
                                    const unsigned> (caKernel[device]);

            caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                 *d_x_lvl, *d_y_lvl, *d_ori_lvl,
                 usable_feat, *lvl_img.data, lvl_img.info,
                 patch_size);
            CL_DEBUG_FINISH(getQueue());

            Param lvl_filt;
            Param lvl_tmp;

            if (blur_img) {
                lvl_filt = lvl_img;
                lvl_tmp = lvl_img;

                lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * lvl_filt.info.dims[1] * sizeof(T));
                lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * lvl_tmp.info.dims[1] * sizeof(T));

                // Calculate a separable Gaussian kernel
                if (h_gauss == nullptr) {
                    h_gauss = new T[gauss_len];
                    gaussian1D(h_gauss, gauss_len, 2.f);
                    gauss_filter.info.dims[0] = gauss_len;
                    gauss_filter.info.strides[0] = 1;

                    for (int k = 1; k < 4; k++) {
                        gauss_filter.info.dims[k] = 1;
                        gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] * gauss_filter.info.strides[k - 1];
                    }

                    dim_type gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
                    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
                    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
                }

                // Filter level image with Gaussian kernel to reduce noise sensitivity
                convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter);
                convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter);

                bufferFree(lvl_tmp.data);
            }

            // Compute ORB descriptors
            cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned));
            unsigned* h_desc_lvl = new unsigned[usable_feat * 8];
            for (int j = 0; j < (int)usable_feat * 8; j++)
                h_desc_lvl[j] = 0;
            getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0, usable_feat * 8 * sizeof(unsigned), h_desc_lvl);
            delete[] h_desc_lvl;

            auto eoOp = make_kernel<Buffer, const unsigned,
                                    Buffer, Buffer, Buffer, Buffer,
                                    Buffer, KParam,
                                    const float, const unsigned> (eoKernel[device]);

            if (blur_img) {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_filt.data, lvl_filt.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());

                bufferFree(lvl_filt.data);
            }
            else {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_img.data, lvl_img.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());
            }

            // Store results to pyramids
            total_feat += usable_feat;
            feat_pyr[i] = usable_feat;
            d_x_pyr[i] = d_x_lvl;
            d_y_pyr[i] = d_y_lvl;
            d_score_pyr[i] = d_score_lvl;
            d_ori_pyr[i] = d_ori_lvl;
            d_size_pyr[i] = d_size_lvl;
            d_desc_pyr[i] = d_desc_lvl;

            if (i > 0 && i == max_levels-1)
                bufferFree(lvl_img.data);
        }

        if (gauss_filter.data != nullptr)
            bufferFree(gauss_filter.data);
        if (h_gauss != nullptr)
            delete[] h_gauss;

        // If no features are found, set found features to 0 and return
        if (total_feat == 0) {
            *out_feat = 0;
            return;
        }

        // Allocate output memory
        x_out.info.dims[0] = total_feat;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total_feat;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total_feat;
        score_out.info.strides[0] = 1;
        ori_out.info.dims[0] = total_feat;
        ori_out.info.strides[0] = 1;
        size_out.info.dims[0] = total_feat;
        size_out.info.strides[0] = 1;

        desc_out.info.dims[0] = 8;
        desc_out.info.strides[0] = 1;
        desc_out.info.dims[1] = total_feat;
        desc_out.info.strides[1] = desc_out.info.dims[0];

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
            ori_out.info.dims[k] = 1;
            ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
            size_out.info.dims[k] = 1;
            size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
            if (k > 1) {
                desc_out.info.dims[k] = 1;
                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
            }
        }

        if (total_feat > 0) {
            size_t out_sz  = total_feat * sizeof(float);
            x_out.data     = bufferAlloc(out_sz);
            y_out.data     = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);
            ori_out.data   = bufferAlloc(out_sz);
            size_out.data  = bufferAlloc(out_sz);

            size_t desc_sz = total_feat * 8 * sizeof(unsigned);
            desc_out.data  = bufferAlloc(desc_sz);
        }

        unsigned offset = 0;
        for (unsigned i = 0; i < max_levels; i++) {
            if (feat_pyr[i] == 0)
                continue;

            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));

            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned));

            // Advance by this level's count after copying so that skipped
            // (empty) levels cannot desynchronize the write offset
            offset += feat_pyr[i];

            bufferFree(d_x_pyr[i]);
            bufferFree(d_y_pyr[i]);
            bufferFree(d_score_pyr[i]);
            bufferFree(d_ori_pyr[i]);
            bufferFree(d_size_pyr[i]);
            bufferFree(d_desc_pyr[i]);
        }

        // Sets number of output features
        *out_feat = total_feat;
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example no. 13
void VkeCubeTexture::loadTextureFiles(const char **inPath){

	bool imagesOK = true;
	VKA_INFO_MSG("Loading Cube Texture.\n");
	for (uint32_t i = 0; i < 6; ++i){
		if (!loadTexture(inPath[i], NULL, NULL, &m_width, &m_height)){
			VKA_ERROR_MSG("Error loading texture image.\n");
			printf("Texture : %d not available (%s).\n", i, inPath[i]);
			return;
		}
	}

	VulkanDC::Device::Queue::Name queueName = "DEFAULT_GRAPHICS_QUEUE";
	VulkanDC::Device::Queue::CommandBufferID cmdID = INIT_COMMAND_ID;
	VulkanDC *dc = VulkanDC::Get();
	VulkanDC::Device *device = dc->getDefaultDevice();
	VulkanDC::Device::Queue *queue = device->getQueue(queueName);
	VkCommandBuffer cmd = VK_NULL_HANDLE;

	queue->beginCommandBuffer(cmdID, &cmd, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);

	imageCreateAndBind(
		&m_data.image,
		&m_data.memory,
		m_format, VK_IMAGE_TYPE_2D,
		m_width, m_height, 1, 6,
		VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
		(VkImageUsageFlagBits)( VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT ),
		VK_IMAGE_TILING_OPTIMAL);

	VkBuffer cubeMapBuffer;
	VkDeviceMemory cubeMapMem;

	bufferCreate(&cubeMapBuffer, m_width*m_height * 4 * 6, VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
	bufferAlloc(&cubeMapBuffer, &cubeMapMem, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);

	VkDeviceSize dSize = m_width * m_height * 4;
	uint32_t rowPitch = m_width * 4;

	if (m_memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT){
		imageSetLayoutBarrier(cmdID, queueName, m_data.image, VK_IMAGE_ASPECT_COLOR_BIT, VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_GENERAL);
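
		// Load each of the six faces directly into its slice of the staging
		// buffer: dSize bytes per face at 4 bytes per texel.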

		for (uint32_t i = 0; i < 6; ++i){

			void *data = NULL;
			VkDeviceSize ofst = dSize*i;

			VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(),cubeMapMem, ofst, dSize, 0, &data), "Could not map memory for image.\n");

			if (!loadTexture(inPath[i], (uint8_t**)&data, rowPitch, &m_width, &m_height)){
				VKA_ERROR_MSG("Could not load final image.\n");
			}

			vkUnmapMemory(getDefaultDevice(), cubeMapMem);
		}

		VkBufferImageCopy biCpyRgn[6];
			

		for (uint32_t k = 0; k < 6; ++k){
			VkDeviceSize ofst = dSize*k;

			biCpyRgn[k].bufferOffset = ofst;
			biCpyRgn[k].bufferImageHeight = 0;
			biCpyRgn[k].bufferRowLength = 0;
			biCpyRgn[k].imageExtent.width = m_width;
			biCpyRgn[k].imageExtent.height = m_height;
			biCpyRgn[k].imageExtent.depth = 1;
			biCpyRgn[k].imageOffset.x = 0;
			biCpyRgn[k].imageOffset.y = 0;
			biCpyRgn[k].imageOffset.z = 0;
			biCpyRgn[k].imageSubresource.baseArrayLayer = k;
			biCpyRgn[k].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
			biCpyRgn[k].imageSubresource.layerCount = 1;
			biCpyRgn[k].imageSubresource.mipLevel = 0;

		}

		VkFence copyFence;
		VkFenceCreateInfo fenceInfo;
		memset(&fenceInfo, 0, sizeof(fenceInfo));
		fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
		

		vkCreateFence(device->getVKDevice(), &fenceInfo,NULL , &copyFence);

		vkCmdCopyBufferToImage(cmd, cubeMapBuffer, m_data.image, m_data.imageLayout, 6, biCpyRgn);
		queue->flushCommandBuffer(cmdID , &copyFence);

		vkWaitForFences(device->getVKDevice(), 1, &copyFence, VK_TRUE, 100000000000);
		
		vkDestroyBuffer(device->getVKDevice(), cubeMapBuffer, NULL);
		vkFreeMemory(device->getVKDevice(), cubeMapMem, NULL);

	}


	VkSamplerCreateInfo sampler = {};  // zero-initialize so unset fields (flags, compareEnable, ...) are 0

	sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
	sampler.pNext = NULL;
	sampler.magFilter = VK_FILTER_NEAREST;
	sampler.minFilter = VK_FILTER_NEAREST;
	sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
	sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.mipLodBias = 0.0f;
	sampler.maxAnisotropy = 1;
	sampler.compareOp = VK_COMPARE_OP_NEVER;
	sampler.minLod = 0.0f;
	sampler.maxLod = 0.0f;

	sampler.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE;

	VkImageViewCreateInfo view = {};  // zero-initialize so unset fields (flags, ...) are 0
	view.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
	view.pNext = NULL;
	view.viewType = VK_IMAGE_VIEW_TYPE_CUBE; 
	view.format = m_format;
	view.components = {
		VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A
	};

	view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
	view.subresourceRange.baseMipLevel = 0;
	view.subresourceRange.levelCount = 1;
	view.subresourceRange.baseArrayLayer = 0;
	view.subresourceRange.layerCount = 6;  // a cube view must expose all six faces

	VKA_CHECK_ERROR(vkCreateSampler(getDefaultDevice(), &sampler,NULL, &m_data.sampler), "Could not create sampler for image texture.\n");

	view.image = m_data.image;

	VKA_CHECK_ERROR(vkCreateImageView(getDefaultDevice(), &view,NULL, &m_data.view), "Could not create image view for texture.\n");


	VKA_INFO_MSG("Created CUBE Image Texture.\n");

}
Example no. 14
void VkeCubeTexture::loadCubeDDS(const char *inFile){


	std::string searchPaths[] = {
		std::string(PROJECT_NAME),
		NVPWindow::sysExePath() + std::string(PROJECT_RELDIRECTORY),
		std::string(PROJECT_ABSDIRECTORY)
	};

	nv_dds::CDDSImage ddsImage;

	for (uint32_t i = 0; i < 3; ++i){
        std::string separator = "";
        uint32_t strSize = searchPaths[i].size();
        if (searchPaths[i].substr(strSize - 1, 1) != "/") separator = "/";
        std::string filePath = searchPaths[i] + separator + std::string("images/") + std::string(inFile);
        ddsImage.load(filePath, true);
		if (ddsImage.is_valid()) break;
	}

	if (!ddsImage.is_valid()){
		perror("Could not cube load texture image.\n");
		exit(1);
	}

	uint32_t imgW = ddsImage.get_width();
	uint32_t imgH = ddsImage.get_height();
	uint32_t comCount = ddsImage.get_components();
	uint32_t fmt = ddsImage.get_format();

	bool isCube = ddsImage.is_cubemap();
	bool isComp = ddsImage.is_compressed();

	VkFormat vkFmt = VK_FORMAT_R8G8B8A8_UNORM;
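
	// Map the DDS (OpenGL) compressed format to the closest Vulkan
	// block-compressed format; anything else falls through to RGBA8.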

	switch (fmt){
	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
		vkFmt = VK_FORMAT_BC1_RGB_SRGB_BLOCK;
		break;

	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
		vkFmt = VK_FORMAT_BC2_UNORM_BLOCK;

		break;

	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
		vkFmt = VK_FORMAT_BC3_UNORM_BLOCK;
		break;
	default:

		break;
	}


	m_width = imgW;
	m_height = imgH;
	m_format = vkFmt;

	VulkanDC::Device::Queue::Name queueName = "DEFAULT_GRAPHICS_QUEUE";
	VulkanDC::Device::Queue::CommandBufferID cmdID = INIT_COMMAND_ID;
	VulkanDC *dc = VulkanDC::Get();
	VulkanDC::Device *device = dc->getDefaultDevice();
	VulkanDC::Device::Queue *queue = device->getQueue(queueName);
	VkCommandBuffer cmd = VK_NULL_HANDLE;

	queue->beginCommandBuffer(cmdID, &cmd, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);

	imageCreateAndBind(
		&m_data.image,
		&m_data.memory,
		m_format, VK_IMAGE_TYPE_2D,
		m_width, m_height, 1, 6,
		VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
		(VkImageUsageFlagBits)(VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT),
		VK_IMAGE_TILING_OPTIMAL);

	VkBuffer cubeMapBuffer;
	VkDeviceMemory cubeMapMem;

	bufferCreate(&cubeMapBuffer, m_width*m_height * 3 * 6, VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
	bufferAlloc(&cubeMapBuffer, &cubeMapMem, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);


	if (m_memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT){
		imageSetLayoutBarrier(cmdID, queueName, m_data.image, VK_IMAGE_ASPECT_COLOR_BIT, VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_GENERAL);

		for (uint32_t i = 0; i < 6; ++i){

			void *data = NULL;
			VkSubresourceLayout layout;
			VkImageSubresource subres;
			subres.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
			subres.mipLevel = m_mip_level;
			subres.arrayLayer = i;
			vkGetImageSubresourceLayout(getDefaultDevice(), m_data.image, &subres, &layout);


			VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(), cubeMapMem, layout.offset, layout.size, 0, &data), "Could not map memory for image.\n");

			const nv_dds::CTexture &mipmap = ddsImage.get_cubemap_face(i);

			memcpy(data, (void *)mipmap, layout.size);



			vkUnmapMemory(getDefaultDevice(), cubeMapMem);
		}

		VkBufferImageCopy biCpyRgn[6];


		for (uint32_t k = 0; k < 6; ++k){
			VkSubresourceLayout layout;
			VkImageSubresource subres;
			subres.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
			subres.mipLevel = m_mip_level;
			subres.arrayLayer = k;
			vkGetImageSubresourceLayout(getDefaultDevice(), m_data.image, &subres, &layout);

			biCpyRgn[k].bufferOffset = layout.offset;
			biCpyRgn[k].bufferImageHeight = 0;
			biCpyRgn[k].bufferRowLength = 0;
			biCpyRgn[k].imageExtent.width = m_width;
			biCpyRgn[k].imageExtent.height = m_height;
			biCpyRgn[k].imageExtent.depth = 1;
			biCpyRgn[k].imageOffset.x = 0;
			biCpyRgn[k].imageOffset.y = 0;
			biCpyRgn[k].imageOffset.z = 0;
			biCpyRgn[k].imageSubresource.baseArrayLayer = k;
			biCpyRgn[k].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
			biCpyRgn[k].imageSubresource.layerCount = 1;
			biCpyRgn[k].imageSubresource.mipLevel = 0;

		}

		VkFence copyFence;
		VkFenceCreateInfo fenceInfo;
		memset(&fenceInfo, 0, sizeof(fenceInfo));
		fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;


		vkCreateFence(device->getVKDevice(), &fenceInfo, NULL, &copyFence);

		vkCmdCopyBufferToImage(cmd, cubeMapBuffer, m_data.image, m_data.imageLayout, 6, biCpyRgn);
		queue->flushCommandBuffer(cmdID, &copyFence);

		vkWaitForFences(device->getVKDevice(), 1, &copyFence, VK_TRUE, 100000000000);

		vkDestroyBuffer(device->getVKDevice(), cubeMapBuffer, NULL);
		vkFreeMemory(device->getVKDevice(), cubeMapMem, NULL);

	}


	VkSamplerCreateInfo sampler = {};  // zero-initialize so unset fields (flags, compareEnable, ...) are 0

	sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
	sampler.pNext = NULL;
	sampler.magFilter = VK_FILTER_NEAREST;
	sampler.minFilter = VK_FILTER_NEAREST;
	sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
	sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
	sampler.mipLodBias = 0.0f;
	sampler.maxAnisotropy = 1;
	sampler.compareOp = VK_COMPARE_OP_NEVER;
	sampler.minLod = 0.0f;
	sampler.maxLod = 0.0f;

	sampler.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE;

	VkImageViewCreateInfo view = {};  // zero-initialize so unset fields (flags, ...) are 0
	view.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
	view.pNext = NULL;
	view.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
	view.format = m_format;
	view.components.r = VK_COMPONENT_SWIZZLE_R;
	view.components.g = VK_COMPONENT_SWIZZLE_G;
	view.components.b = VK_COMPONENT_SWIZZLE_B;
	view.components.a = VK_COMPONENT_SWIZZLE_A;

	view.subresourceRange.baseArrayLayer = 0;
	view.subresourceRange.levelCount = 1;
	view.subresourceRange.baseMipLevel = 0;
	view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
	view.subresourceRange.layerCount = 6;  // a cube view must expose all six faces

	VKA_CHECK_ERROR(vkCreateSampler(getDefaultDevice(), &sampler, NULL, &m_data.sampler), "Could not create sampler for image texture.\n");

	view.image = m_data.image;

	VKA_CHECK_ERROR(vkCreateImageView(getDefaultDevice(), &view, NULL, &m_data.view), "Could not create image view for texture.\n");





}
Example no. 15
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string ref_name = std::string("csrmm_nt_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam,
                      T, T, Buffer>(csrmm_nt_kernel);
    NDRange local(THREADS_PER_GROUP, 1);
    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);
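
    // One zero-initialized counter per column group, written to the device
    // up front; presumably only consumed by the greedy scheduling path,
    // which is disabled above.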

    std::vector<int> count(groups_x);
    cl::Buffer *counter = bufferAlloc(count.size() * sizeof(int));
    getQueue().enqueueWriteBuffer(
        *counter, CL_TRUE, 0, count.size() * sizeof(int), (void *)count.data());

    csrmm_nt_func(EnqueueArgs(getQueue(), global, local), *out.data,
                  *values.data, *rowIdx.data, *colIdx.data, M, N, *rhs.data,
                  rhs.info, alpha, beta, *counter);

    bufferFree(counter);
}
Example no. 16
static void where(Param &out, Param &in)
{
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);
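
    // Two-pass compaction: scan the non-zero predicate to obtain write
    // positions, then gather the flat indices of the non-zero elements.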

    Param rtmp;
    Param otmp;

    rtmp.info.dims[0] = groups_x;
    otmp.info.dims[0] = in.info.dims[0];

    rtmp.info.strides[0] = 1;
    otmp.info.strides[0] = 1;

    rtmp.info.offset = 0;
    otmp.info.offset = 0;

    for (int k = 1; k < 4; k++) {
        rtmp.info.dims[k] = in.info.dims[k];
        rtmp.info.strides[k] = rtmp.info.strides[k - 1] * rtmp.info.dims[k - 1];

        otmp.info.dims[k] = in.info.dims[k];
        otmp.info.strides[k] = otmp.info.strides[k - 1] * otmp.info.dims[k - 1];
    }

    int rtmp_elements = rtmp.info.strides[3] * rtmp.info.dims[3];
    rtmp.data = bufferAlloc(rtmp_elements * sizeof(uint));

    int otmp_elements = otmp.info.strides[3] * otmp.info.dims[3];
    otmp.data = bufferAlloc(otmp_elements * sizeof(uint));

    scan_first_launcher<T, uint, af_notzero_t>(otmp, rtmp, in, false, groups_x, groups_y, threads_x);

    // Linearize the dimensions and perform scan
    Param ltmp = rtmp;
    ltmp.info.offset = 0;
    ltmp.info.dims[0] = rtmp_elements;
    for (int k = 1; k < 4; k++) {
        ltmp.info.dims[k] = 1;
        ltmp.info.strides[k] = rtmp_elements;
    }

    scan_first<uint, uint, af_add_t>(ltmp, ltmp);

    // Get output size and allocate output
    uint total;
    getQueue().enqueueReadBuffer(*rtmp.data, CL_TRUE,
                                  sizeof(uint) * (rtmp_elements - 1),
                                  sizeof(uint),
                                  &total);

    out.data = bufferAlloc(total * sizeof(uint));

    out.info.dims[0] = total;
    out.info.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        out.info.dims[k] = 1;
        out.info.strides[k] = total;
    }

    if (total > 0)
        get_out_idx<T>(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y);

    bufferFree(rtmp.data);
    bufferFree(otmp.data);
}
void vkeGameRendererDynamic::initIndirectCommands(){

	if (!m_node_data) return;

	VulkanDC *dc = VulkanDC::Get();
	VulkanDC::Device *device = dc->getDefaultDevice();
	VulkanDC::Device::Queue *queue = dc->getDefaultQueue();

	uint32_t cnt = m_node_data->count();
	uint32_t sz = sizeof(VkDrawIndexedIndirectCommand)*cnt;

	VkBuffer sceneIndirectStaging;
	VkDeviceMemory sceneIndirectMemStaging;

	VkBufferUsageFlags usageFlags = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;

	bufferCreate(&m_scene_indirect_buffer, sz, (VkBufferUsageFlagBits)usageFlags);
	bufferAlloc(&m_scene_indirect_buffer, &m_scene_indirect_memory, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

	usageFlags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

	bufferCreate(&sceneIndirectStaging, sz, (VkBufferUsageFlagBits)usageFlags);
	bufferAlloc(&sceneIndirectStaging, &sceneIndirectMemStaging, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);

	VkDrawIndexedIndirectCommand *commands = NULL;

	VKA_CHECK_ERROR(vkMapMemory(device->getVKDevice(), sceneIndirectMemStaging, 0, sz, 0, (void **)&commands), "Could not map indirect buffer memory.\n");
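
	// One indirect draw per node: each command draws m_instance_count
	// instances of that node's mesh, using the mesh's first index and
	// vertex offsets.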

	for (uint32_t i = 0; i < cnt; ++i){
		VkeMesh *mesh = m_node_data->getData(i)->getMesh();
		commands[i].firstIndex = mesh->getFirstIndex();
		commands[i].firstInstance = i*m_instance_count;
		commands[i].vertexOffset = mesh->getFirstVertex();
		commands[i].indexCount = mesh->getIndexCount();
		commands[i].instanceCount = m_instance_count;
	}

	vkUnmapMemory(device->getVKDevice(), sceneIndirectMemStaging);

	VkBufferCopy bufCpy;
	bufCpy.dstOffset = 0;
	bufCpy.srcOffset = 0;
	bufCpy.size = sz;

	VkCommandBuffer copyCmd;
	VkCommandBufferAllocateInfo cmdBufInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
	cmdBufInfo.commandBufferCount = 1;
	cmdBufInfo.commandPool = queue->getCommandPool();
	cmdBufInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;

	VKA_CHECK_ERROR(vkAllocateCommandBuffers(device->getVKDevice(), &cmdBufInfo, &copyCmd), "Could not allocate command buffers.\n");

	VkCommandBufferBeginInfo cmdBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
	cmdBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

	VKA_CHECK_ERROR(vkBeginCommandBuffer(copyCmd, &cmdBeginInfo), "Could not begin command buffer.\n");

	vkCmdCopyBuffer(copyCmd, sceneIndirectStaging, m_scene_indirect_buffer, 1, &bufCpy);

	VKA_CHECK_ERROR(vkEndCommandBuffer(copyCmd), "Could not end command buffer.\n");

	VkSubmitInfo subInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
	subInfo.commandBufferCount = 1;
	subInfo.pCommandBuffers = &copyCmd;

	VkFence theFence;
	VkFenceCreateInfo fenceInfo = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };

	VKA_CHECK_ERROR(vkCreateFence(device->getVKDevice(), &fenceInfo, NULL, &theFence), "Could not create fence.\n");
	VKA_CHECK_ERROR(vkQueueSubmit(queue->getVKQueue(), 1, &subInfo, theFence), "Could not submit queue for indirect buffer copy.\n");
	VKA_CHECK_ERROR(vkWaitForFences(device->getVKDevice(), 1, &theFence, VK_TRUE, UINT_MAX), "Could not wait for fence.\n");

	vkFreeCommandBuffers(device->getVKDevice(), queue->getCommandPool(), 1, &copyCmd);
	vkDestroyFence(device->getVKDevice(), theFence, NULL);

}
Example no. 18
void fast(unsigned* out_feat,
          Param &x_out,
          Param &y_out,
          Param &score_out,
          Param in,
          const float thr,
          const float feature_ratio,
          const unsigned edge)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fastProgs;
        static std::map<int, Kernel*>  lfKernel;
        static std::map<int, Kernel*>  nmKernel;
        static std::map<int, Kernel*>  gfKernel;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D ARC_LENGTH=" << arc_length
                        << " -D NONMAX=" << static_cast<unsigned>(nonmax);

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, fast_cl, fast_cl_len, options.str());
                fastProgs[device] = new Program(prog);

                lfKernel[device] = new Kernel(*fastProgs[device], "locate_features");
                nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts");
                gfKernel[device] = new Kernel(*fastProgs[device], "get_features");
            });

        const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio);

        // Matrix of scores for detected features; scores are stored at the
        // same coordinates as the features, so its dimensions equal those of in.
        cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float));
        std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0);
        getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]);

        cl::Buffer *d_flags = d_score;
        if (nonmax) {
            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
        }

        const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X);
        const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y);

        // Locate features kernel sizes
        const NDRange local(FAST_THREADS_X, FAST_THREADS_Y);
        const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y);

        auto lfOp = make_kernel<Buffer, KParam,
                                Buffer, const float, const unsigned,
                                LocalSpaceArg> (*lfKernel[device]);

        lfOp(EnqueueArgs(getQueue(), global, local),
             *in.data, in.info, *d_score, thr, edge,
             cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T)));
        CL_DEBUG_FINISH(getQueue());

        const int blk_nonmax_x = divup(in.info.dims[0], 64);
        const int blk_nonmax_y = divup(in.info.dims[1], 64);

        // Nonmax kernel sizes
        const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y);
        const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y);

        unsigned count_init = 0;
        cl::Buffer *d_total = bufferAlloc(sizeof(unsigned));
        getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init);

        size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned);
        cl::Buffer *d_counts  = bufferAlloc(blocks_sz);
        cl::Buffer *d_offsets = bufferAlloc(blocks_sz);

        auto nmOp = make_kernel<Buffer, Buffer, Buffer,
                                Buffer, Buffer,
                                KParam, const unsigned> (*nmKernel[device]);
        nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                         *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge);
        CL_DEBUG_FINISH(getQueue());

        unsigned total;
        getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total);
        total = total < max_feat ? total : max_feat;

        if (total > 0) {
            size_t out_sz = total * sizeof(float);
            x_out.data = bufferAlloc(out_sz);
            y_out.data = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);

            auto gfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer,
                                    KParam, const unsigned,
                                    const unsigned> (*gfKernel[device]);
            gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                             *x_out.data, *y_out.data, *score_out.data,
                             *d_flags, *d_counts, *d_offsets,
                             in.info, total, edge);
            CL_DEBUG_FINISH(getQueue());
        }

        *out_feat = total;

        x_out.info.dims[0] = total;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total;
        score_out.info.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = total;
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = total;
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = total;
        }

        bufferFree(d_score);
        if (nonmax) bufferFree(d_flags);
        bufferFree(d_total);
        bufferFree(d_counts);
        bufferFree(d_offsets);
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
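
The three kernels follow a count/scan/scatter compaction scheme: locate_features writes per-pixel scores, non_max_counts suppresses non-maxima and tallies survivors (d_counts per work-group, d_total overall, d_offsets for the write positions), and get_features scatters the surviving coordinates into the packed outputs. The device code is not shown here; the following is only a scalar illustration of that pattern:

#include <cstddef>
#include <vector>

// Scalar sketch of count -> offsets -> scatter compaction. On the device each
// work-group produces one entry of the counts array, and an exclusive scan of
// those counts yields each group's write offset into the packed output.
static void compact(const std::vector<float> &score, std::vector<size_t> &out)
{
    size_t count = 0;
    for (float s : score)                        // phase 1: count survivors
        if (s > 0.0f) ++count;

    out.clear();
    out.reserve(count);                          // the scan is trivial with one "group"
    for (size_t i = 0; i < score.size(); ++i)    // phase 2: scatter indices
        if (score[i] > 0.0f) out.push_back(i);
}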
Example #19
void morph(Param         out,
        const Param      in,
        const Param      mask)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> morProgs;
        static std::map<int, Kernel*> morKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                ToNumStr<T> toNumStr;
                T init = isDilation ? Binary<T, af_max_t>().init() : Binary<T, af_min_t>().init();
                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D isDilation="<< isDilation
                        << " -D init=" << toNumStr(init)
                        << " -D windLen=" << windLen;
                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }
                Program prog;
                buildProgram(prog, morph_cl, morph_cl_len, options.str());
                morProgs[device]   = new Program(prog);
                morKernels[device] = new Kernel(*morProgs[device], "morph");
            });

        auto morphOp = KernelFunctor<Buffer, KParam,
                                   Buffer, KParam,
                                   Buffer, cl::LocalSpaceArg,
                                   int, int
                                  >(*morKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(in.info.dims[0], THREADS_X);
        int blk_y = divup(in.info.dims[1], THREADS_Y);
        // launch blk_x * dims[2] work-groups along x and blk_y * dims[3] along y
        NDRange global(blk_x * THREADS_X * in.info.dims[2],
                       blk_y * THREADS_Y * in.info.dims[3]);

        // copy mask/filter to a separate device buffer (read as constant memory by the kernel)
        size_t se_size   = sizeof(T)*windLen*windLen;
        cl::Buffer *mBuff = bufferAlloc(se_size);
        getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

        // calculate shared memory size
        const int halo    = windLen/2;
        const int padding = 2*halo;
        const int locLen  = THREADS_X + padding + 1;
        const int locSize = locLen * (THREADS_Y+padding);

        morphOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info, *in.data, in.info, *mBuff,
                cl::Local(locSize*sizeof(T)), blk_x, blk_y);

        bufferFree(mBuff);

        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
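
The local-memory tile is the work-group footprint plus a halo of windLen/2 pixels on every side; the +1 in locLen pads each row, a common trick to stagger local-memory bank accesses. A worked example, assuming THREADS_X = THREADS_Y = 16 (the actual values are defined elsewhere) and windLen = 5:

// Worked example of the local-memory sizing above, under assumed constants.
constexpr int halo    = 5 / 2;                   // 2
constexpr int padding = 2 * halo;                // 4
constexpr int locLen  = 16 + padding + 1;        // 21 elements per padded row
constexpr int locSize = locLen * (16 + padding); // 21 * 20 = 420 elements
// i.e. the kernel receives cl::Local(420 * sizeof(T)) bytes of local memory.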
Example #20
void nearest_neighbour(Param idx,
                       Param dist,
                       Param query,
                       Param train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    try {
        const unsigned feat_len = query.info.dims[dist_dim];
        const To max_dist = maxval<To>();

        // Determine maximum feat_len capable of using shared memory (faster)
        cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
        size_t ltrain_sz = THREADS * feat_len * sizeof(T);
        bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz));
        size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;

        unsigned unroll_len = nextpow2(feat_len);
        if (unroll_len != feat_len) unroll_len = 0;

        std::string ref_name =
            std::string("knn_") +
            std::to_string(dist_type) +
            std::string("_") +
            std::to_string(use_lmem) +
            std::string("_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::to_string(unroll_len);

        int device = getActiveDeviceId();
        kc_t::iterator cache_idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (cache_idx == kernelCaches[device].end()) {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D To=" << dtype_traits<To>::getName()
                        << " -D THREADS=" << THREADS
                        << " -D FEAT_LEN=" << unroll_len;

                switch(dist_type) {
                    case AF_SAD: options <<" -D DISTOP=_sad_"; break;
                    case AF_SSD: options <<" -D DISTOP=_ssd_"; break;
                    case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__";
                                 break;
                    default: break;
                }

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                if (use_lmem)
                    options << " -D USE_LOCAL_MEM";

                cl::Program prog;
                buildProgram(prog,
                             nearest_neighbour_cl,
                             nearest_neighbour_cl_len,
                             options.str());

                entry.prog = new Program(prog);
                entry.ker = new Kernel[3];

                entry.ker[0] = Kernel(*entry.prog, "nearest_neighbour_unroll");
                entry.ker[1] = Kernel(*entry.prog, "nearest_neighbour");
                entry.ker[2] = Kernel(*entry.prog, "select_matches");

                kernelCaches[device][ref_name] = entry;
        } else {
            entry = cache_idx->second;
        }

        const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

        const unsigned nquery = query.info.dims[sample_dim];
        const unsigned ntrain = train.info.dims[sample_dim];

        unsigned nblk = divup(ntrain, THREADS);
        const NDRange local(THREADS, 1);
        const NDRange global(nblk * THREADS, 1);

        cl::Buffer *d_blk_idx  = bufferAlloc(nblk * nquery * sizeof(unsigned));
        cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To));

        // For each query vector, find the training vector with the smallest
        // distance within each work-group
        if (unroll_len > 0) {
            auto huOp = KernelFunctor<Buffer, Buffer,
                                    Buffer, KParam,
                                    Buffer, KParam,
                                    const To,
                                    LocalSpaceArg> (entry.ker[0]);

            huOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, cl::Local(lmem_sz));
        }
        else {
            auto hmOp = KernelFunctor<Buffer, Buffer,
                                    Buffer, KParam,
                                    Buffer, KParam,
                                    const To, const unsigned,
                                    LocalSpaceArg> (entry.ker[1]);

            hmOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, feat_len, cl::Local(lmem_sz));
        }
        CL_DEBUG_FINISH(getQueue());

        const NDRange local_sm(32, 8);
        const NDRange global_sm(divup(nquery, 32) * 32, 8);

        // Reduce the per-work-group best distances and store the final
        // best match for each query
        auto smOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                const unsigned, const unsigned,
                                const To> (entry.ker[2]);

        smOp(EnqueueArgs(getQueue(), global_sm, local_sm),
             *idx.data, *dist.data,
             *d_blk_idx, *d_blk_dist,
             nquery, nblk, max_dist);
        CL_DEBUG_FINISH(getQueue());

        bufferFree(d_blk_idx);
        bufferFree(d_blk_dist);
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
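
Both kernel paths compute, for every query vector, the training vector at minimal distance: first the best candidate per work-group, then select_matches reduces across the nblk per-block results. As a reference for what the pipeline computes (not how the kernels compute it), a scalar version of the AF_SSD case might look like this; all names here are illustrative:

#include <cstddef>
#include <limits>
#include <vector>

// Scalar reference: for each query row, find the training row with the
// smallest sum of squared differences over feat_len components.
static void nn_ssd(const std::vector<float> &query, size_t nquery,
                   const std::vector<float> &train, size_t ntrain,
                   size_t feat_len,
                   std::vector<size_t> &idx, std::vector<float> &dist)
{
    idx.assign(nquery, 0);
    dist.assign(nquery, std::numeric_limits<float>::max());
    for (size_t q = 0; q < nquery; ++q) {
        for (size_t t = 0; t < ntrain; ++t) {
            float d = 0.0f;
            for (size_t f = 0; f < feat_len; ++f) {
                const float diff = query[q * feat_len + f] - train[t * feat_len + f];
                d += diff * diff;
            }
            if (d < dist[q]) { dist[q] = d; idx[q] = t; } // keep the running best
        }
    }
}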
Example #21
T *memAlloc(const size_t &elements)
{
    // Typed front-end over the pooled allocator: scales the element count to
    // bytes and returns the pooled handle as a T*.
    managerInit();
    return (T *)bufferAlloc(elements * sizeof(T));
}
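
A short usage sketch: the returned T* is only a type-tagged handle to a pooled cl::Buffer, so it must be released through the pool rather than via free/delete (a matching memFree, if the allocator provides one, would simply forward to bufferFree):

// Sketch: allocate 1024 floats on the device, recover the underlying buffer,
// and return it to the pool when done.
float *tmp = memAlloc<float>(1024);
cl::Buffer *buf = reinterpret_cast<cl::Buffer *>(tmp); // what memAlloc actually returns
// ... enqueue kernels reading/writing *buf ...
bufferFree(buf);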