Exemple #1
0
inline void PackedVector::resize(size_t new_size) {
    if (new_size < filled) {
        size_t shrink_capacity = vec.size() / (factor * factor);
        if (new_size < shrink_capacity) {
            sdsl::int_vector<> tmp;
            tmp.width(vec.width());
            tmp.resize(new_size);
            for (size_t i = 0; i < new_size; i++) {
                tmp[i] = vec[i];
            }
            vec = std::move(tmp);
        }
    }
    else if (new_size > vec.size()) {
        size_t new_capacity = std::max<size_t>(size_t(vec.size() * factor) + 1, new_size);
        sdsl::int_vector<> tmp;
        tmp.width(vec.width());
        tmp.resize(new_capacity);
        for (size_t i = 0; i < filled; i++) {
            tmp[i] = vec[i];
        }
        vec = std::move(tmp);
    }
    filled = new_size;
}
Exemple #2
0
 // Write m_sa followed by m_text to `out` and return the sum of the sizes
 // reported by the two serialize calls. Both members are registered directly
 // under `v` (labelled "sa" and "text"); `name` is deliberately unused.
 size_type serialize(std::ostream& out, sdsl::structure_tree_node* v=NULL, std::string name="")const
 {
     (void)name;
     const auto sa_bytes = m_sa.serialize(out, v, "sa");
     const auto text_bytes = m_text.serialize(out, v, "text");
     return sa_bytes + text_bytes;
 }
Exemple #3
0
 // Exchange the contents of this index with `ir`; swapping an object with
 // itself is a no-op.
 void swap(index_bstree& ir)
 {
     if (this == &ir)
         return;

     m_sa.swap(ir.m_sa);
     m_text.swap(ir.m_text);
 }
Exemple #4
0
// Build an alphabet from per-entry counts and the two mapping tables.
//
// char2comp and comp2char are copied from the arguments, sigma is set to
// _comp2char.size(), and C is created with sigma + 1 zero entries and then
// filled with the running sums of counts: C[i + 1] = C[i] + counts[i].
//
// NOTE(review): the loop is bounded by counts.size(), not by sigma; this
// presumably relies on counts.size() <= _comp2char.size() -- confirm at callers.
Alphabet::Alphabet(const sdsl::int_vector<64>& counts,
  const sdsl::int_vector<8>& _char2comp, const sdsl::int_vector<8>& _comp2char) :
  char2comp(_char2comp), comp2char(_comp2char),
  C(_comp2char.size() + 1, 0),
  sigma(_comp2char.size())
{
  for(size_type comp = 0; comp < counts.size(); ++comp)
  {
    this->C[comp + 1] = this->C[comp] + counts[comp];
  }
}
Exemple #5
0
        /**
         * Search for occurrences of a gapped pattern.
         *
         * Every subpattern is located with forward_search over m_text / m_sa.
         * The occurrences of consecutive subpatterns are then joined: an
         * occurrence at position `pos` extends a partial match whose last
         * subpattern starts at `f` when `f` lies in the window derived from
         * min_gap / max_gap (see the inner loop). Finally the start positions
         * of the complete matches are reported in ascending order, skipping
         * every match that starts before the end of the previously reported one.
         *
         * The raw regular expression is written to std::cerr on every call.
         *
         * @param pat  pattern providing `subpatterns`, `gaps` and `raw_regexp`
         * @return     result whose `positions` holds the reported match starts;
         *             empty when the pattern has no subpatterns
         */
        gapped_search_result
        search(const gapped_pattern& pat) const
        {
            gapped_search_result res;
            const vector<string_type>& s = pat.subpatterns;

            std::cerr << "REGEX ::: " << pat.raw_regexp << std::endl;

            // Nothing to look for; this also protects the s[0] accesses below.
            if (s.empty())
                return res;

            // The gap bounds are only consulted when joining two subpatterns, so
            // pat.gaps is not touched for a pattern made of a single subpattern.
            // NOTE(review): the bounds are derived from s[0] / gaps[0] only and are
            // reused for every later subpattern -- confirm this is intended for
            // patterns with more than one gap.
            size_type min_gap = 0;
            size_type max_gap = 0;
            if (s.size() > 1) {
                min_gap = s[0].size() + pat.gaps[0].first;
                max_gap = s[0].size() + pat.gaps[0].second;
            }

            const auto last_size = s[s.size() - 1].size();

            // get ranges: [its[k], its2[k]) spans the m_sa entries of subpattern k
            vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its;
            vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its2;

            // NOTE(review): sx is a per-subpattern copy; `const auto&` is likely
            // sufficient if forward_search accepts const iterators -- confirm.
            for (auto sx : s) {
                size_type sp, ep;
                forward_search(m_text.begin(), m_text.end(), m_sa, 0, m_sa.size()-1, sx.begin(), sx.end(), sp, ep);
                its.push_back(m_sa.begin() + sp);
                its2.push_back(m_sa.begin() + ep + 1);
            }

            // Each span is (position of the last joined subpattern, start of the match).
            vector<pii> spans1, spans2;
            for (auto it = its[0]; it != its2[0]; ++it)
                spans1.emplace_back(*it, *it);

            auto pspan1 = &spans1;
            auto pspan2 = &spans2;

            // incremental search
            for (size_t i = 1; i < its.size(); ++i) {
                std::sort(pspan1->begin(), pspan1->end());
                pspan2->clear();
                for (auto it = its[i]; it != its2[i]; ++it) {
                    auto pos = *it;

                    // Positions are unsigned, so `pos - min_gap` would wrap around:
                    // an occurrence this close to the text start cannot extend anything.
                    if (pos < min_gap)
                        continue;

                    // Likewise `pos - max_gap` would wrap; the window then simply
                    // starts at the first span instead of at a huge bogus key.
                    auto pot_match = (pos < max_gap)
                        ? pspan1->begin()
                        : lower_bound(pspan1->begin(), pspan1->end(), make_pair(pos - max_gap, pos));

                    for (; pot_match != pspan1->end() && pot_match->first <= pos - min_gap; ++pot_match)
                        pspan2->emplace_back(pos, pot_match->second);
                }
                std::swap(pspan1, pspan2);
            }
            std::sort(pspan1->begin(), pspan1->end(), [](const pii &left, const pii &right) { return left.second < right.second; });

            // Report a match, then skip every following match that starts before
            // the reported one ends. The iterator is advanced in exactly one place
            // so it can neither step past end() nor skip an unexamined match.
            auto it = pspan1->begin();
            while (it != pspan1->end()) {
                res.positions.push_back(it->second);
                const auto end = it->first + last_size;
                do {
                    ++it;
                } while (it != pspan1->end() && it->second < end);
            }

            return res;
        }
Exemple #6
0
inline void PackedVector::reserve(const size_t& future_size) {
    if (future_size > vec.size()) {
        sdsl::int_vector<> tmp;
        tmp.width(vec.width());
        tmp.resize(future_size);
        for (size_t i = 0; i < filled; i++) {
            tmp[i] = vec[i];
        }
        vec = std::move(tmp);
    }
}
Exemple #7
0
	// Initialise the ranker from the cache described by `cconfig`.
	//
	// Loads num_terms from the KEY_COLLEN cache entry, builds the document
	// length file first when it is not cached yet, loads doc_lengths, and
	// derives num_docs (number of entries) and min_doc_len (smallest entry).
	// Both derived values are printed to std::cerr.
	rank_tfidf(cache_config& cconfig) {
		load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

		const bool have_doc_lengths = cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig);
		if (!have_doc_lengths) {
			surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
		}
		load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

		num_docs = doc_lengths.size();
		std::cerr<<"num_docs = "<<num_docs<<std::endl;

		min_doc_len = *std::min_element(doc_lengths.begin(),doc_lengths.end());
		std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
	}
Exemple #8
0
        // Build the index from the text file registered under consts::KEY_TEXT.
        //
        // The text is loaded into m_text. A temporary compressed suffix array is
        // constructed over the same file and its entries are copied into m_sa,
        // whose bit width is chosen so that csa.size() - 1 fits.
        index_bstree(collection& col)
        {
            const auto& text_file = col.file_map[consts::KEY_TEXT];
            sdsl::load_from_file(m_text, text_file);

            sdsl::csa_wt<sdsl::wt_huff<>, 1> csa;
            sdsl::construct(csa, text_file, 0);

            const auto entries = csa.size();
            const auto sa_width = sdsl::bits::hi(entries - 1) + 1;
            m_sa = sdsl::int_vector<>(entries, 0, sa_width);
            std::copy(csa.begin(), csa.end(), m_sa.begin());
        }
 // Restore the ranker state from `ifs`.
 //
 // Reads num_docs, num_terms and avg_doc_len (in that order), logs the three
 // values, then loads doc_lengths from the same stream.
 //
 // The parameter is a plain std::istream so that any input stream (file,
 // string stream, ...) can be used; existing callers passing a std::ifstream
 // keep working unchanged.
 void load(std::istream& ifs)
 {
     sdsl::read_member(num_docs,ifs);
     sdsl::read_member(num_terms,ifs);
     sdsl::read_member(avg_doc_len,ifs);
     LOG(INFO) << "num docs : " << num_docs;
     LOG(INFO) << "num terms : " << num_terms;
     LOG(INFO) << "avg doc len : " << avg_doc_len;
     doc_lengths.load(ifs);
 }
Exemple #10
0
 // Write num_docs, num_terms, avg_doc_len and doc_lengths to `out`, in that
 // order, and return the total reported by the individual writes. A child
 // node labelled `name` is added under `v` and receives that total as its size.
 size_type serialize(std::ostream& out, sdsl::structure_tree_node* v=NULL, std::string name="") const
 {
     sdsl::structure_tree_node* node = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));

     size_type total = 0;
     total += sdsl::write_member(num_docs, out, node, "num_docs");
     total += sdsl::write_member(num_terms, out, node, "num_terms");
     total += sdsl::write_member(avg_doc_len, out, node, "avg_doc_len");
     total += doc_lengths.serialize(out, node, "doc lengths");

     sdsl::structure_tree::add_size(node, total);
     return total;
 }
Exemple #11
0
	// Initialise the ranker from the cache described by `cconfig`.
	//
	// Reads the KEY_COLLEN cache entry into a local term count, builds the
	// document length file first when it is not cached yet, loads doc_lengths,
	// and derives num_docs (number of entries) and avg_doc_len (term count
	// divided by num_docs). Both derived values are printed to std::cerr.
	rank_bm25(cache_config& cconfig) {
		uint64_t num_terms;
		load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

		const bool have_doc_lengths = cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig);
		if (!have_doc_lengths) {
			surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
		}
		load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

		num_docs = doc_lengths.size();
		std::cerr<<"num_docs = "<<num_docs<<std::endl;

		avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);
		std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
	}
Exemple #12
0
/// Store `value` at index `i`.
///
/// If `value` needs more bits than the backing vector currently provides, the
/// vector is first rebuilt at the smallest sufficient width (same number of
/// slots, first `filled` values copied over).
///
/// @param i      index to write; must be less than `filled` (checked by assert)
/// @param value  value to store
inline void PackedVector::set(const size_t& i, const uint64_t& value) {
    assert(i < filled);

    // Find the smallest width, not below the current one, that can hold `value`.
    // The search stops at the full width of uint64_t: shifting a 64-bit integer
    // by 64 or more is undefined, and no value needs more bits than that.
    const int max_width = std::numeric_limits<uint64_t>::digits;
    uint8_t width = vec.width();
    while (width < max_width && (value >> width) != 0) {
        width++;
    }

    if (width > vec.width()) {
        sdsl::int_vector<> wider_vec;
        wider_vec.width(width);
        wider_vec.resize(vec.size());
        // Separate index name so the parameter `i` is not shadowed.
        for (size_t j = 0; j < filled; j++) {
            wider_vec[j] = vec[j];
        }
        vec = std::move(wider_vec);
    }

    vec[i] = value;
}
Exemple #13
0
    // Initialise the ranker from the files of `col`.
    //
    // doc_lengths is loaded from the KEY_DOCLEN file and num_docs is its
    // number of entries. num_terms is the length of the KEY_TEXT file minus
    // num_docs, and avg_doc_len is num_terms divided by num_docs. The document
    // count and the average length are logged.
    rank_bm25(collection& col)
    {
        load_from_file(doc_lengths,col.file_map[KEY_DOCLEN]);
        num_docs = doc_lengths.size();

        {
            // Scoped so the mapping of the text file is released right away.
            sdsl::int_vector_mapper<> mapped_text(col.file_map[KEY_TEXT]);
            num_terms = mapped_text.size() - num_docs;
        }

        avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);

        LOG(INFO) << "num docs : " << num_docs;
        LOG(INFO) << "avg doc len : " << avg_doc_len;
    }
Exemple #14
0
/// Empty the vector: no elements remain, the backing vector is resized to
/// zero slots and its bit width is set back to 1.
inline void PackedVector::clear() {
    filled = 0;

    vec.resize(0);
    vec.width(1);
}
Exemple #15
0
 // Restore the index from `in`: m_sa is read first, m_text second -- the
 // same order in which serialize() writes them.
 void load(std::istream& in)
 {
     m_sa.load(in);

     m_text.load(in);
 }