예제 #1
0
        /**
         * Searches the text for occurrences of the gapped pattern `pat`.
         *
         * Each subpattern in `pat.subpatterns` is located through the suffix
         * array `m_sa`; occurrences of consecutive subpatterns are then joined
         * whenever the distance between their positions lies within the
         * allowed gap window.
         *
         * @param pat  Pattern to search for; reads `subpatterns`, `gaps` and
         *             `raw_regexp` (the latter is only logged to std::cerr).
         * @return     Result whose `positions` holds the start position of
         *             each reported match in increasing order. A match that
         *             starts before the end of the previously reported match
         *             is skipped. Empty if `pat` has no subpatterns.
         */
        gapped_search_result
        search(const gapped_pattern& pat) const
        {
            gapped_search_result res;
            const vector<string_type>& s = pat.subpatterns;

            std::cerr << "REGEX ::: " << pat.raw_regexp << std::endl;

            // Nothing to look up; also avoids indexing into an empty vector below.
            if (s.empty())
                return res;

            // Allowed distance between the positions of two consecutive
            // subpatterns. Only needed (and only read from pat.gaps) when there
            // is more than one subpattern.
            // NOTE(review): the window is derived from subpattern 0 / gap 0 and
            // reused for every later step -- confirm that all subpatterns and
            // gaps are meant to be uniform.
            size_type min_gap = 0;
            size_type max_gap = 0;
            if (s.size() > 1) {
                min_gap = s[0].size() + pat.gaps[0].first;
                max_gap = s[0].size() + pat.gaps[0].second;
            }

            const auto last_size = s.back().size();

            // get ranges: [its[i], its2[i]) is the suffix-array interval
            // reported by forward_search for subpattern i
            vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its;
            vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its2;

            for (const auto& sx : s) {
                size_type sp, ep;
                forward_search(m_text.begin(), m_text.end(), m_sa, 0, m_sa.size()-1, sx.begin(), sx.end(), sp, ep);
                its.push_back(m_sa.begin() + sp);
                its2.push_back(m_sa.begin() + ep + 1);
            }

            // A span is (position of the most recently matched subpattern,
            // position of the first subpattern of that partial match).
            vector<pii> spans1, spans2;
            for (auto it = its[0]; it != its2[0]; ++it)
                spans1.emplace_back(*it, *it);

            auto pspan1 = &spans1;
            auto pspan2 = &spans2;

            // incremental search
            for (size_t i = 1; i < its.size(); ++i) {
                std::sort(pspan1->begin(), pspan1->end());
                pspan2->clear();
                for (auto it = its[i]; it != its2[i]; ++it) {
                    auto pos = *it;

                    // An occurrence closer than min_gap to the text start cannot
                    // have a predecessor; evaluating pos - min_gap would wrap
                    // around (positions are assumed to be unsigned).
                    if (pos < min_gap)
                        continue;

                    // Lower end of the window. When pos < max_gap the
                    // subtraction would wrap around and the binary search would
                    // start past every real candidate, so start from the very
                    // first span instead.
                    // NOTE(review): with `pos` as the key's second component a
                    // span whose first equals pos - max_gap is only included if
                    // its second is >= pos -- confirm whether max_gap is meant
                    // to be inclusive.
                    auto window_lo = pos - max_gap;
                    auto window_lo_second = pos;
                    if (pos < max_gap) {
                        window_lo = 0;
                        window_lo_second = 0;
                    }

                    for (auto pot_match = lower_bound(pspan1->begin(), pspan1->end(), make_pair(window_lo, window_lo_second));
                         pot_match != pspan1->end() && pot_match->first <= pos - min_gap;
                         ++pot_match)
                        pspan2->emplace_back(pos, pot_match->second);
                }
                std::swap(pspan1, pspan2);
            }
            std::sort(pspan1->begin(), pspan1->end(), [](const pii &left, const pii &right) { return left.second < right.second; });

            // Report matches in order of start position, skipping those that
            // start before the end of the match just reported. The iterator is
            // advanced exactly once past the reported span and then over the
            // overlapping ones, so it never moves beyond end() and the first
            // non-overlapping span is not skipped.
            auto it = pspan1->begin();
            while (it != pspan1->end()) {
                res.positions.push_back(it->second);
                const auto match_end = it->first + last_size;
                ++it;
                while (it != pspan1->end() && it->second < match_end)
                    ++it;
            }

            return res;
        }
예제 #2
0
        /**
         * Builds the index from the text file that `col` registers under
         * consts::KEY_TEXT.
         *
         * The text is loaded into m_text, a compressed suffix array is
         * constructed from the same file, and its entries are copied into the
         * plain integer vector m_sa.
         *
         * @param col  Collection whose file_map supplies the text file name.
         */
        index_bstree(collection& col)
        {
            const auto& text_file = col.file_map[consts::KEY_TEXT];
            sdsl::load_from_file(m_text, text_file);

            // Temporary compressed suffix array; only used as the source for m_sa.
            sdsl::csa_wt<sdsl::wt_huff<>, 1> compressed_sa;
            sdsl::construct(compressed_sa, text_file, 0);

            // Size m_sa to one entry per suffix, with the per-entry bit width
            // derived from the largest index (entries - 1).
            const auto entries = compressed_sa.size();
            const auto width = sdsl::bits::hi(entries - 1) + 1;
            m_sa = sdsl::int_vector<>(entries, 0, width);
            std::copy(compressed_sa.begin(), compressed_sa.end(), m_sa.begin());
        }
예제 #3
0
	/**
	 * Loads the collection statistics used by the ranker from the cache.
	 *
	 * Reads num_terms (stored under surf::KEY_COLLEN) and the document
	 * lengths (stored under surf::KEY_DOC_LENGTHS, constructed first if the
	 * cache file does not exist), then derives num_docs and min_doc_len.
	 * Both derived values are logged to std::cerr.
	 *
	 * @param cconfig  Cache configuration used to locate the cached files.
	 */
	rank_tfidf(cache_config& cconfig) {
		load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
		if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
			surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
		}
		load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
		num_docs = doc_lengths.size();
		std::cerr<<"num_docs = "<<num_docs<<std::endl;
		auto min_itr = std::min_element(doc_lengths.begin(),doc_lengths.end());
		// std::min_element returns end() for an empty range; dereferencing
		// it would be undefined behaviour, so fall back to 0 in that case.
		if (min_itr != doc_lengths.end()) {
			min_doc_len = *min_itr;
		} else {
			min_doc_len = 0;
		}
		std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
	}