inline void PackedVector::resize(size_t new_size) { if (new_size < filled) { size_t shrink_capacity = vec.size() / (factor * factor); if (new_size < shrink_capacity) { sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(new_size); for (size_t i = 0; i < new_size; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } } else if (new_size > vec.size()) { size_t new_capacity = std::max<size_t>(size_t(vec.size() * factor) + 1, new_size); sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(new_capacity); for (size_t i = 0; i < filled; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } filled = new_size; }
// Writes the suffix array followed by the text to `out` and returns the sum
// of the two values reported by their serialize() calls. Both members are
// registered under the caller's structure-tree node `v` (as "sa" and "text");
// the `name` argument is not used.
size_type serialize(std::ostream& out, sdsl::structure_tree_node* v=NULL, std::string name="")const
{
    (void)name;

    // The two writes must stay in this order: load() reads them back the same way.
    auto written = m_sa.serialize(out, v, "sa");
    written += m_text.serialize(out, v, "text");
    return written;
}
// Exchanges the suffix array and the text with those of `ir`.
// Swapping an object with itself is a no-op.
void swap(index_bstree& ir)
{
    if (this == &ir) {
        return;
    }
    m_sa.swap(ir.m_sa);
    m_text.swap(ir.m_text);
}
// Builds an alphabet from per-symbol counts and the two mapping tables.
//
// char2comp / comp2char are copied from the arguments, sigma is the length of
// _comp2char, and C gets sigma + 1 zero-initialised entries that are then
// filled with the running sum of `counts` (C[0] == 0, C[k] == counts[0] + ...
// + counts[k - 1]).
// NOTE(review): the loop writes C[counts.size()], so `counts` is presumably
// expected to have at most _comp2char.size() entries - confirm with callers.
Alphabet::Alphabet(const sdsl::int_vector<64>& counts, const sdsl::int_vector<8>& _char2comp, const sdsl::int_vector<8>& _comp2char) :
    char2comp(_char2comp),
    comp2char(_comp2char),
    C(_comp2char.size() + 1, 0),
    sigma(_comp2char.size())
{
    for(size_type next = 1; next <= counts.size(); next++)
    {
        this->C[next] = this->C[next - 1] + counts[next - 1];
    }
}
// Finds occurrences of a gapped pattern (a sequence of sub-patterns separated
// by bounded gaps) using the suffix array.
//
// Steps:
//   1. Locate the suffix-array range of every sub-pattern with forward_search.
//   2. Seed the candidate list with every occurrence of the first sub-pattern,
//      stored as (position of the most recently matched sub-pattern,
//      position of the first sub-pattern).
//   3. For each further sub-pattern, keep the candidates whose last matched
//      position lies inside the allowed distance window before an occurrence
//      of that sub-pattern.
//   4. Sort the survivors by start position and report a start position for
//      each match that does not begin before the end of the previously
//      reported one.
//
// Returns a result whose `positions` holds the reported start positions; it
// is empty when the pattern has no sub-patterns.
//
// NOTE(review): the distance window is derived from s[0] and pat.gaps[0] only
// and reused for every later sub-pattern - confirm this is intended for
// patterns with differing sub-pattern lengths or gaps.
gapped_search_result search(const gapped_pattern& pat) const
{
    gapped_search_result res;
    const vector<string_type>& s = pat.subpatterns;

    std::cerr << "REGEX ::: " << pat.raw_regexp << std::endl;

    // Without sub-patterns there is nothing to look for (and s[0] would be
    // out of bounds).
    if (s.empty())
        return res;

    // The window is only consulted when there is a second sub-pattern, so
    // pat.gaps is not touched for single-sub-pattern queries.
    size_type min_gap = 0;
    size_type max_gap = 0;
    if (s.size() > 1) {
        min_gap = s[0].size() + pat.gaps[0].first;
        max_gap = s[0].size() + pat.gaps[0].second;
    }
    auto last_size = s[s.size() - 1].size();

    // get ranges
    vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its;
    vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its2;
    // Each sub-pattern is taken by value, as before, so that begin()/end()
    // yield the same iterator types that forward_search was called with.
    for (auto sx : s) {
        size_type sp, ep;
        forward_search(m_text.begin(), m_text.end(), m_sa, 0, m_sa.size()-1, sx.begin(), sx.end(), sp, ep);
        its.push_back(m_sa.begin() + sp);
        its2.push_back(m_sa.begin() + ep + 1);
    }

    vector<pii> spans1, spans2;
    for (auto it = its[0]; it != its2[0]; ++it)
        spans1.emplace_back(*it, *it);

    auto pspan1 = &spans1;
    auto pspan2 = &spans2;

    // incremental search
    for (size_t i = 1; i < its.size(); ++i) {
        std::sort(pspan1->begin(), pspan1->end());
        pspan2->clear();
        for (auto it = its[i]; it != its2[i]; ++it) {
            auto pos = *it;

            // An occurrence closer to the text start than the minimum
            // distance cannot have a predecessor; skipping it also keeps
            // `pos - min_gap` below from wrapping around.
            if (pos < min_gap)
                continue;

            // When the window would start before position 0, every candidate
            // from the beginning is eligible; computing `pos - max_gap` in
            // that case would wrap around and miss them.
            auto pot_match = (pos < max_gap)
                ? pspan1->begin()
                : lower_bound(pspan1->begin(), pspan1->end(), make_pair(pos - max_gap, pos));

            for (; pot_match != pspan1->end() && pot_match->first <= pos - min_gap; ++pot_match)
                pspan2->emplace_back(pos, pot_match->second);
        }
        std::swap(pspan1, pspan2);
    }

    std::sort(pspan1->begin(), pspan1->end(), [](const pii &left, const pii &right) {
        return left.second < right.second;
    });

    // Report matches in order of start position, dropping those that begin
    // before the previously reported match has ended. The iterator is
    // advanced only inside the loop body: advancing it again in a for-header
    // would step past end() or skip the next non-overlapping match.
    auto it = pspan1->begin();
    while (it != pspan1->end()) {
        res.positions.push_back(it->second);
        auto end = it->first + last_size;
        ++it; // the reported span itself is always consumed
        while (it != pspan1->end() && it->second < end)
            ++it;
    }

    return res;
}
inline void PackedVector::reserve(const size_t& future_size) { if (future_size > vec.size()) { sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(future_size); for (size_t i = 0; i < filled; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } }
// Initialises the ranker from the cache described by `cconfig`.
//
// Loads num_terms from the KEY_COLLEN cache entry, builds the document-length
// file first if it is not cached yet, loads the document lengths, and derives
// num_docs and min_doc_len from them. Both derived values are echoed to
// std::cerr.
// NOTE(review): min_doc_len dereferences the result of std::min_element, so
// the document-length vector is presumably never empty here - confirm.
rank_tfidf(cache_config& cconfig)
{
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    const bool have_doc_lengths = cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig);
    if (!have_doc_lengths) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    min_doc_len = *std::min_element(doc_lengths.begin(),doc_lengths.end());
    std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
}
// Builds the index from the text file registered in `col` under KEY_TEXT.
//
// The text itself is loaded into m_text. A temporary compressed suffix array
// is constructed from the same file, and its entries are copied into m_sa, a
// plain integer vector whose width is just large enough to hold the value
// csa.size() - 1.
index_bstree(collection& col)
{
    const auto& text_file = col.file_map[consts::KEY_TEXT];
    sdsl::load_from_file(m_text, text_file);

    sdsl::csa_wt<sdsl::wt_huff<>, 1> csa;
    sdsl::construct(csa, text_file, 0);

    const auto entries = csa.size();
    // Index of the highest set bit of the largest stored value, plus one.
    const auto bits_per_entry = sdsl::bits::hi(entries - 1) + 1;

    m_sa = sdsl::int_vector<>(entries, 0, bits_per_entry);
    std::copy(csa.begin(), csa.end(), m_sa.begin());
}
// Restores the ranker state from `ifs`.
//
// Reads num_docs, num_terms and avg_doc_len (in that order), logs the three
// values at INFO level, and finally loads the document-length vector.
void load(std::ifstream& ifs)
{
    // Scalar members first.
    sdsl::read_member(num_docs,ifs);
    sdsl::read_member(num_terms,ifs);
    sdsl::read_member(avg_doc_len,ifs);

    LOG(INFO) << "num docs : " << num_docs;
    LOG(INFO) << "num terms : " << num_terms;
    LOG(INFO) << "avg doc len : " << avg_doc_len;

    // The document lengths follow the scalar members in the stream.
    doc_lengths.load(ifs);
}
// Writes the ranker state to `out` and returns the number of bytes written.
//
// A child node for this object is added below `v` in the structure tree; the
// members num_docs, num_terms, avg_doc_len and the document-length vector are
// written in that order and registered under that child, and the total size
// is recorded on it.
size_type serialize(std::ostream& out, sdsl::structure_tree_node* v=NULL, std::string name="") const
{
    sdsl::structure_tree_node* node = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));

    size_type total = 0;
    total += sdsl::write_member(num_docs,out,node,"num_docs");
    total += sdsl::write_member(num_terms,out,node,"num_terms");
    total += sdsl::write_member(avg_doc_len,out,node,"avg_doc_len");
    total += doc_lengths.serialize(out,node,"doc lengths");

    sdsl::structure_tree::add_size(node, total);
    return total;
}
// Initialises the ranker from the cache described by `cconfig`.
//
// Loads the collection length from the KEY_COLLEN cache entry, builds the
// document-length file first if it is not cached yet, loads the document
// lengths, and derives num_docs and avg_doc_len (collection length divided by
// the number of documents). Both derived values are echoed to std::cerr.
rank_bm25(cache_config& cconfig)
{
    // NOTE(review): this is a local variable; if the class also declares a
    // num_terms member, that member is not assigned here - confirm intent.
    uint64_t num_terms;
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    const bool have_doc_lengths = cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig);
    if (!have_doc_lengths) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);
    std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
}
// Stores `value` at index `i`, widening the whole vector first if the value
// does not fit into the current per-entry bit width.
//
// i     - index to write; must be smaller than `filled` (checked by assert).
// value - value to store.
//
// When widening is needed, a new backing vector of the same length is
// allocated with the smallest sufficient width and the first `filled` entries
// are copied over.
inline void PackedVector::set(const size_t& i, const uint64_t& value) {
    assert(i < filled);

    // Smallest width, not below the current one, whose range contains value.
    // The search is capped at the number of bits in uint64_t: shifting a
    // 64-bit integer by 64 or more is undefined, which previously happened
    // when the vector was already 64 bits wide or the value had its top bit
    // set.
    constexpr uint8_t max_width = std::numeric_limits<uint64_t>::digits;
    uint8_t width = vec.width();
    while (width < max_width && (value >> width) != 0) {
        ++width;
    }

    if (width > vec.width()) {
        sdsl::int_vector<> wider_vec;
        wider_vec.width(width);
        wider_vec.resize(vec.size());
        // Separate index name so the parameter `i` is not shadowed.
        for (size_t j = 0; j < filled; j++) {
            wider_vec[j] = vec[j];
        }
        vec = std::move(wider_vec);
    }

    vec[i] = value;
}
// Initialises the ranker from the files registered in `col`.
//
// Loads the document lengths from the KEY_DOCLEN file, takes num_docs from
// their count, computes num_terms as the length of the KEY_TEXT file minus
// num_docs, and derives avg_doc_len as num_terms / num_docs. The document
// count and the average length are logged at INFO level.
rank_bm25(collection& col)
{
    load_from_file(doc_lengths,col.file_map[KEY_DOCLEN]);
    num_docs = doc_lengths.size();

    {
        // The mapper is confined to this scope so it is released as soon as
        // the text length has been read.
        sdsl::int_vector_mapper<> text(col.file_map[KEY_TEXT]);
        // presumably the text holds one extra symbol per document - verify
        num_terms = text.size() - num_docs;
    }

    avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);

    LOG(INFO) << "num docs : " << num_docs;
    LOG(INFO) << "avg doc len : " << avg_doc_len;
}
// Empties the vector: the logical length becomes 0, the backing vector is
// resized to zero entries, and the per-entry bit width is reset to 1.
inline void PackedVector::clear() {
    filled = 0;
    // Resize before resetting the width, as in the original statement order.
    vec.resize(0);
    vec.width(1);
}
// Restores the index from `in`: the suffix array is read first, then the
// text, matching the order in which the two members are read nowhere else in
// this function.
void load(std::istream& in)
{
    m_sa.load(in);
    m_text.load(in);
}