Example #1
0
void orb(unsigned* out_feat,
         Param& x_out,
         Param& y_out,
         Param& score_out,
         Param& ori_out,
         Param& size_out,
         Param& desc_out,
         Param image,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels,
         const bool blur_img)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program            orbProgs[DeviceManager::MAX_DEVICES];
        static Kernel             hrKernel[DeviceManager::MAX_DEVICES];
        static Kernel             kfKernel[DeviceManager::MAX_DEVICES];
        static Kernel             caKernel[DeviceManager::MAX_DEVICES];
        static Kernel             eoKernel[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D BLOCK_SIZE=" << ORB_THREADS_X;

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                buildProgram(orbProgs[device],
                             orb_cl,
                             orb_cl_len,
                             options.str());

                hrKernel[device] = Kernel(orbProgs[device], "harris_response");
                kfKernel[device] = Kernel(orbProgs[device], "keep_features");
                caKernel[device] = Kernel(orbProgs[device], "centroid_angle");
                eoKernel[device] = Kernel(orbProgs[device], "extract_orb");
            });

        unsigned patch_size = REF_PAT_SIZE;

        unsigned min_side = std::min(image.info.dims[0], image.info.dims[1]);
        unsigned max_levels = 0;
        float scl_sum = 0.f;
        for (unsigned i = 0; i < levels; i++) {
            min_side /= scl_fctr;

            // Minimum image side for a descriptor to be computed
            if (min_side < patch_size || max_levels == levels) break;

            max_levels++;
            scl_sum += 1.f / (float)pow(scl_fctr,(float)i);
        }

        std::vector<cl::Buffer*> d_x_pyr(max_levels);
        std::vector<cl::Buffer*> d_y_pyr(max_levels);
        std::vector<cl::Buffer*> d_score_pyr(max_levels);
        std::vector<cl::Buffer*> d_ori_pyr(max_levels);
        std::vector<cl::Buffer*> d_size_pyr(max_levels);
        std::vector<cl::Buffer*> d_desc_pyr(max_levels);

        std::vector<unsigned> feat_pyr(max_levels);
        unsigned total_feat = 0;

        // Compute number of features to keep for each level
        std::vector<unsigned> lvl_best(max_levels);
        unsigned feat_sum = 0;
        for (unsigned i = 0; i < max_levels-1; i++) {
            float lvl_scl = (float)pow(scl_fctr,(float)i);
            lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl);
            feat_sum += lvl_best[i];
        }
        lvl_best[max_levels-1] = max_feat - feat_sum;

        // Maintain a reference to previous level image
        Param prev_img;
        Param lvl_img;

        const unsigned gauss_len = 9;
        T* h_gauss = nullptr;
        Param gauss_filter;
        gauss_filter.data = nullptr;

        for (unsigned i = 0; i < max_levels; i++) {
            const float lvl_scl = (float)pow(scl_fctr,(float)i);

            if (i == 0) {
                // First level is used in its original size
                lvl_img = image;

                prev_img = image;
            }
            else if (i > 0) {
                // Resize previous level image to current level dimensions
                lvl_img.info.dims[0] = round(image.info.dims[0] / lvl_scl);
                lvl_img.info.dims[1] = round(image.info.dims[1] / lvl_scl);

                lvl_img.info.strides[0] = 1;
                lvl_img.info.strides[1] = lvl_img.info.dims[0];

                for (int k = 2; k < 4; k++) {
                    lvl_img.info.dims[k] = 1;
                    lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] * lvl_img.info.strides[k - 1];
                }

                lvl_img.info.offset = 0;
                lvl_img.data = bufferAlloc(lvl_img.info.dims[3] * lvl_img.info.strides[3] * sizeof(T));

                resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img);

                if (i > 1)
                   bufferFree(prev_img.data);
                prev_img = lvl_img;
            }

            unsigned lvl_feat = 0;
            Param d_x_feat, d_y_feat, d_score_feat;

            // Round feature size to nearest odd integer
            float size = 2.f * floor(patch_size / 2.f) + 1.f;

            // Avoid keeping features that might be too wide and might not fit on
            // the image, sqrt(2.f) is the radius when angle is 45 degrees and
            // represents widest case possible
            unsigned edge = ceil(size * sqrt(2.f) / 2.f);

            // Detect FAST features
            fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat,
                             lvl_img, fast_thr, 0.15f, edge);

            if (lvl_feat == 0) {
                feat_pyr[i] = 0;

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }

            bufferFree(d_score_feat.data);

            unsigned usable_feat = 0;
            cl::Buffer* d_usable_feat = bufferAlloc(sizeof(unsigned));
            getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            cl::Buffer* d_x_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_y_harris = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float));

            // Calculate Harris responses
            // Good block_size >= 7 (must be an odd number)
            const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X);
            const NDRange local(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            unsigned block_size = 7;
            float k_thr = 0.04f;

            auto hrOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, const unsigned,
                                    Buffer, Buffer, KParam,
                                    const unsigned, const float, const unsigned> (hrKernel[device]);

            hrOp(EnqueueArgs(getQueue(), global, local),
                 *d_x_harris, *d_y_harris, *d_score_harris,
                 *d_x_feat.data, *d_y_feat.data, lvl_feat,
                 *d_usable_feat, *lvl_img.data, lvl_img.info,
                 block_size, k_thr, patch_size);
            CL_DEBUG_FINISH(getQueue());

            getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat);

            bufferFree(d_x_feat.data);
            bufferFree(d_y_feat.data);
            bufferFree(d_usable_feat);

            if (usable_feat == 0) {
                feat_pyr[i] = 0;

                bufferFree(d_x_harris);
                bufferFree(d_y_harris);
                bufferFree(d_score_harris);

                if (i > 0 && i == max_levels-1)
                    bufferFree(lvl_img.data);

                continue;
            }

            // Sort features according to Harris responses
            Param d_harris_sorted;
            Param d_harris_idx;

            d_harris_sorted.info.dims[0] = usable_feat;
            d_harris_idx.info.dims[0] = usable_feat;
            d_harris_sorted.info.strides[0] = 1;
            d_harris_idx.info.strides[0] = 1;

            for (int k = 1; k < 4; k++) {
                d_harris_sorted.info.dims[k] = 1;
                d_harris_idx.info.dims[k] = 1;
                d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] * d_harris_sorted.info.strides[k - 1];
                d_harris_idx.info.strides[k] = d_harris_idx.info.dims[k - 1] * d_harris_idx.info.strides[k - 1];
            }

            d_harris_sorted.info.offset = 0;
            d_harris_idx.info.offset = 0;
            d_harris_sorted.data = d_score_harris;
            d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));

            sort0_index<float, false>(d_harris_sorted, d_harris_idx);

            cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float));

            usable_feat = min(usable_feat, lvl_best[i]);

            // Keep only features with higher Harris responses
            const dim_type keep_blk = divup(usable_feat, ORB_THREADS);
            const NDRange local_keep(ORB_THREADS, 1);
            const NDRange global_keep(keep_blk * ORB_THREADS, 1);

            auto kfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer, Buffer,
                                    const unsigned> (kfKernel[device]);

            kfOp(EnqueueArgs(getQueue(), global_keep, local_keep),
                 *d_x_lvl, *d_y_lvl, *d_score_lvl,
                 *d_x_harris, *d_y_harris, *d_harris_sorted.data, *d_harris_idx.data,
                 usable_feat);
            CL_DEBUG_FINISH(getQueue());

            bufferFree(d_x_harris);
            bufferFree(d_y_harris);
            bufferFree(d_harris_sorted.data);
            bufferFree(d_harris_idx.data);

            cl::Buffer* d_ori_lvl = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float));

            // Compute orientation of features
            const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X);
            const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            auto caOp = make_kernel<Buffer, Buffer, Buffer,
                                    const unsigned, Buffer, KParam,
                                    const unsigned> (caKernel[device]);

            caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                 *d_x_lvl, *d_y_lvl, *d_ori_lvl,
                 usable_feat, *lvl_img.data, lvl_img.info,
                 patch_size);
            CL_DEBUG_FINISH(getQueue());

            Param lvl_filt;
            Param lvl_tmp;

            if (blur_img) {
                lvl_filt = lvl_img;
                lvl_tmp = lvl_img;

                lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * lvl_filt.info.dims[1] * sizeof(T));
                lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * lvl_tmp.info.dims[1] * sizeof(T));

                // Calculate a separable Gaussian kernel
                if (h_gauss == nullptr) {
                    h_gauss = new T[gauss_len];
                    gaussian1D(h_gauss, gauss_len, 2.f);
                    gauss_filter.info.dims[0] = gauss_len;
                    gauss_filter.info.strides[0] = 1;

                    for (int k = 1; k < 4; k++) {
                        gauss_filter.info.dims[k] = 1;
                        gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] * gauss_filter.info.strides[k - 1];
                    }

                    dim_type gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
                    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
                    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
                }

                // Filter level image with Gaussian kernel to reduce noise sensitivity
                convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter);
                convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter);

                bufferFree(lvl_tmp.data);
            }

            // Compute ORB descriptors
            cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned));
            unsigned* h_desc_lvl = new unsigned[usable_feat * 8];
            for (int j = 0; j < (int)usable_feat * 8; j++)
                h_desc_lvl[j] = 0;
            getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0, usable_feat * 8 * sizeof(unsigned), h_desc_lvl);
            delete[] h_desc_lvl;

            auto eoOp = make_kernel<Buffer, const unsigned,
                                    Buffer, Buffer, Buffer, Buffer,
                                    Buffer, KParam,
                                    const float, const unsigned> (eoKernel[device]);

            if (blur_img) {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_filt.data, lvl_filt.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());

                bufferFree(lvl_filt.data);
            }
            else {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_img.data, lvl_img.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());
            }

            // Store results to pyramids
            total_feat += usable_feat;
            feat_pyr[i] = usable_feat;
            d_x_pyr[i] = d_x_lvl;
            d_y_pyr[i] = d_y_lvl;
            d_score_pyr[i] = d_score_lvl;
            d_ori_pyr[i] = d_ori_lvl;
            d_size_pyr[i] = d_size_lvl;
            d_desc_pyr[i] = d_desc_lvl;

            if (i > 0 && i == max_levels-1)
                bufferFree(lvl_img.data);
        }

        if (gauss_filter.data != nullptr)
            bufferFree(gauss_filter.data);
        if (h_gauss != nullptr)
            delete[] h_gauss;

        // If no features are found, set found features to 0 and return
        if (total_feat == 0) {
            *out_feat = 0;
            return;
        }

        // Allocate output memory
        x_out.info.dims[0] = total_feat;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total_feat;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total_feat;
        score_out.info.strides[0] = 1;
        ori_out.info.dims[0] = total_feat;
        ori_out.info.strides[0] = 1;
        size_out.info.dims[0] = total_feat;
        size_out.info.strides[0] = 1;

        desc_out.info.dims[0] = 8;
        desc_out.info.strides[0] = 1;
        desc_out.info.dims[1] = total_feat;
        desc_out.info.strides[1] = desc_out.info.dims[0];

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
            ori_out.info.dims[k] = 1;
            ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
            size_out.info.dims[k] = 1;
            size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
            if (k > 1) {
                desc_out.info.dims[k] = 1;
                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
            }
        }

        if (total_feat > 0) {
            size_t out_sz  = total_feat * sizeof(float);
            x_out.data     = bufferAlloc(out_sz);
            y_out.data     = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);
            ori_out.data   = bufferAlloc(out_sz);
            size_out.data  = bufferAlloc(out_sz);

            size_t desc_sz = total_feat * 8 * sizeof(unsigned);
            desc_out.data  = bufferAlloc(desc_sz);
        }

        unsigned offset = 0;
        for (unsigned i = 0; i < max_levels; i++) {
            if (feat_pyr[i] == 0)
                continue;

            if (i > 0)
                offset += feat_pyr[i-1];

            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));

            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned));

            bufferFree(d_x_pyr[i]);
            bufferFree(d_y_pyr[i]);
            bufferFree(d_score_pyr[i]);
            bufferFree(d_ori_pyr[i]);
            bufferFree(d_size_pyr[i]);
            bufferFree(d_desc_pyr[i]);
        }

        // Sets number of output features
        *out_feat = total_feat;
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #2
0
void orb(unsigned* out_feat,
         float** d_x,
         float** d_y,
         float** d_score,
         float** d_ori,
         float** d_size,
         unsigned** d_desc,
         std::vector<unsigned>& feat_pyr,
         std::vector<float*>& d_x_pyr,
         std::vector<float*>& d_y_pyr,
         std::vector<unsigned>& lvl_best,
         std::vector<float>& lvl_scl,
         std::vector<CParam<T> >& img_pyr,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels)
{
    unsigned patch_size = REF_PAT_SIZE;

    unsigned max_levels = feat_pyr.size();

    // In future implementations, the user will be capable of passing his
    // distribution instead of using the reference one
    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));

    std::vector<float*> d_score_pyr(max_levels);
    std::vector<float*> d_ori_pyr(max_levels);
    std::vector<float*> d_size_pyr(max_levels);
    std::vector<unsigned*> d_desc_pyr(max_levels);
    std::vector<unsigned*> d_idx_pyr(max_levels);

    unsigned total_feat = 0;

    // Calculate a separable Gaussian kernel
    unsigned gauss_len = 9;
    convAccT* h_gauss = new convAccT[gauss_len];
    gaussian1D(h_gauss, gauss_len, 2.f);
    Param<convAccT> gauss_filter;
    gauss_filter.dims[0] = gauss_len;
    gauss_filter.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        gauss_filter.dims[k] = 1;
        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
    }

    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));

    delete[] h_gauss;

    for (int i = 0; i < (int)max_levels; i++) {
        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        unsigned* d_usable_feat = memAlloc<unsigned>(1);
        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));

        float* d_x_harris = memAlloc<float>(feat_pyr[i]);
        float* d_y_harris = memAlloc<float>(feat_pyr[i]);
        float* d_score_harris = memAlloc<float>(feat_pyr[i]);

        // Calculate Harris responses
        // Good block_size >= 7 (must be an odd number)
        dim3 threads(THREADS_X, THREADS_Y);
        dim3 blocks(divup(feat_pyr[i], threads.x), 1);
        harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,
                                                      d_x_pyr[i], d_y_pyr[i], NULL,
                                                      feat_pyr[i], d_usable_feat,
                                                      img_pyr[i], 7, 0.04f, patch_size);
        POST_LAUNCH_CHECK();

        unsigned usable_feat = 0;
        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_usable_feat);

        feat_pyr[i] = usable_feat;

        if (feat_pyr[i] == 0) {
            memFree(d_x_harris);
            memFree(d_y_harris);
            memFree(d_score_harris);
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        Param<float> harris_sorted;
        Param<unsigned> harris_idx;

        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
        harris_sorted.strides[0] = harris_idx.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            harris_sorted.dims[k] = 1;
            harris_sorted.strides[k] = harris_sorted.dims[k - 1] * harris_sorted.strides[k - 1];
            harris_idx.dims[k] = 1;
            harris_idx.strides[k] = harris_idx.dims[k - 1] * harris_idx.strides[k - 1];
        }

        dim_type sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3];
        harris_sorted.ptr = d_score_harris;
        harris_idx.ptr = memAlloc<unsigned>(sort_elem);

        // Sort features according to Harris responses
        sort0_index<float, false>(harris_sorted, harris_idx);

        feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]);

        float* d_x_lvl = memAlloc<float>(feat_pyr[i]);
        float* d_y_lvl = memAlloc<float>(feat_pyr[i]);
        float* d_score_lvl = memAlloc<float>(feat_pyr[i]);

        // Keep only features with higher Harris responses
        threads = dim3(THREADS, 1);
        blocks = dim3(divup(feat_pyr[i], threads.x), 1);
        keep_features<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_score_lvl, NULL,
                                              d_x_harris, d_y_harris, harris_sorted.ptr, harris_idx.ptr,
                                              NULL, feat_pyr[i]);
        POST_LAUNCH_CHECK();

        memFree(d_x_harris);
        memFree(d_y_harris);
        memFree(harris_sorted.ptr);
        memFree(harris_idx.ptr);

        float* d_ori_lvl = memAlloc<float>(feat_pyr[i]);

        // Compute orientation of features
        threads = dim3(THREADS_X, THREADS_Y);
        blocks  = dim3(divup(feat_pyr[i], threads.x), 1);
        centroid_angle<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_ori_lvl, feat_pyr[i],
                                               img_pyr[i], patch_size);
        POST_LAUNCH_CHECK();

        Param<T> lvl_tmp;
        Param<T> lvl_filt;

        for (int k = 0; k < 4; k++) {
            lvl_tmp.dims[k] = img_pyr[i].dims[k];
            lvl_tmp.strides[k] = img_pyr[i].strides[k];
            lvl_filt.dims[k] = img_pyr[i].dims[k];
            lvl_filt.strides[k] = img_pyr[i].strides[k];
        }

        dim_type lvl_elem = img_pyr[i].strides[3] * img_pyr[i].dims[3];
        lvl_tmp.ptr = memAlloc<T>(lvl_elem);
        lvl_filt.ptr = memAlloc<T>(lvl_elem);

        // Separable Gaussian filtering to reduce noise sensitivity
        convolve2<T, convAccT, 0, false>(lvl_tmp, img_pyr[i], gauss_filter);
        convolve2<T, convAccT, 1, false>(lvl_filt, CParam<T>(lvl_tmp), gauss_filter);

        memFree(lvl_tmp.ptr);
        if (i > 0) {
            memFree((T*)img_pyr[i].ptr);
        }

        img_pyr[i].ptr = lvl_filt.ptr;
        for (int k = 0; k < 4; k++) {
            img_pyr[i].dims[k] = lvl_filt.dims[k];
            img_pyr[i].strides[k] = lvl_filt.strides[k];
        }

        float* d_size_lvl = memAlloc<float>(feat_pyr[i]);

        unsigned* d_desc_lvl = memAlloc<unsigned>(feat_pyr[i] * 8);
        CUDA_CHECK(cudaMemset(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned)));

        // Compute ORB descriptors
        threads = dim3(THREADS_X, THREADS_Y);
        blocks  = dim3(divup(feat_pyr[i], threads.x), 1);
        extract_orb<T><<<blocks, threads>>>(d_desc_lvl, feat_pyr[i],
                                            d_x_lvl, d_y_lvl, d_ori_lvl, d_size_lvl,
                                            img_pyr[i], lvl_scl[i], patch_size);
        POST_LAUNCH_CHECK();

        memFree((T*)img_pyr[i].ptr);

        // Store results to pyramids
        total_feat += feat_pyr[i];
        d_x_pyr[i] = d_x_lvl;
        d_y_pyr[i] = d_y_lvl;
        d_score_pyr[i] = d_score_lvl;
        d_ori_pyr[i] = d_ori_lvl;
        d_size_pyr[i] = d_size_lvl;
        d_desc_pyr[i] = d_desc_lvl;
    }

    memFree((T*)gauss_filter.ptr);

    // If no features are found, set found features to 0 and return
    if (total_feat == 0) {
        *out_feat = 0;
        return;
    }

    // Allocate output memory
    *d_x     = memAlloc<float>(total_feat);
    *d_y     = memAlloc<float>(total_feat);
    *d_score = memAlloc<float>(total_feat);
    *d_ori   = memAlloc<float>(total_feat);
    *d_size  = memAlloc<float>(total_feat);
    *d_desc  = memAlloc<unsigned>(total_feat * 8);
    unsigned offset = 0;
    for (unsigned i = 0; i < max_levels; i++) {
        if (feat_pyr[i] == 0)
            continue;

        if (i > 0)
            offset += feat_pyr[i-1];

        CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));

        CUDA_CHECK(cudaMemcpy(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned), cudaMemcpyDeviceToDevice));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_score_pyr[i]);
        memFree(d_ori_pyr[i]);
        memFree(d_size_pyr[i]);
        memFree(d_desc_pyr[i]);
    }

    // Sets number of output features
    *out_feat = total_feat;
}