inline void PackedVector::resize(size_t new_size) { if (new_size < filled) { size_t shrink_capacity = vec.size() / (factor * factor); if (new_size < shrink_capacity) { sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(new_size); for (size_t i = 0; i < new_size; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } } else if (new_size > vec.size()) { size_t new_capacity = std::max<size_t>(size_t(vec.size() * factor) + 1, new_size); sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(new_capacity); for (size_t i = 0; i < filled; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } filled = new_size; }
// Builds an alphabet from per-symbol occurrence counts and the two
// character <-> compact-symbol mapping tables.
//
// `char2comp` and `comp2char` are copied from the arguments, `sigma` is set
// to `_comp2char.size()`, and `C` is constructed with the arguments
// (`_comp2char.size() + 1`, 0) before being filled with the running totals of
// `counts`: C[i + 1] = C[i] + counts[i].
// NOTE(review): the loop writes C[counts.size()], so it assumes
// counts.size() <= _comp2char.size() -- confirm at the call sites.
Alphabet::Alphabet(const sdsl::int_vector<64>& counts,
                   const sdsl::int_vector<8>& _char2comp,
                   const sdsl::int_vector<8>& _comp2char) :
  char2comp(_char2comp), comp2char(_comp2char),
  C(_comp2char.size() + 1, 0),
  sigma(_comp2char.size())
{
  // Each entry is the previous entry plus the count of the preceding symbol.
  for(size_type next = 1; next <= counts.size(); next++)
  {
    this->C[next] = this->C[next - 1] + counts[next - 1];
  }
}
// Searches the suffix array for occurrences of a gapped pattern and returns
// the start positions of non-overlapping matches in `res.positions`.
//
// Steps:
//   1. Each subpattern is located with forward_search, giving a suffix-array
//      range [its[k], its2[k]) of its occurrences.
//   2. Spans are kept as (start of the most recent subpattern, start of the
//      first subpattern). Starting from the occurrences of subpattern 0, each
//      further subpattern extends the spans whose most recent start lies
//      within the allowed distance before the new occurrence.
//   3. The surviving spans are sorted by their first-subpattern start and
//      filtered greedily so that reported matches do not overlap.
//
// Returns an empty result when `pat.subpatterns` is empty.
gapped_search_result search(const gapped_pattern& pat) const {
    gapped_search_result res;
    const vector<string_type>& s = pat.subpatterns;
    std::cerr << "REGEX ::: " << pat.raw_regexp << std::endl;

    // Nothing to search for; this also keeps s[0] / its[0] below in bounds.
    if (s.empty()) {
        return res;
    }

    auto last_size = s[s.size() - 1].size();

    // get ranges
    vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its;
    vector<sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator> its2;
    // `sx` is taken by value, as before, so the iterator type handed to
    // forward_search is unchanged.
    for (auto sx : s) {
        size_type sp, ep;
        forward_search(m_text.begin(), m_text.end(), m_sa, 0, m_sa.size()-1, sx.begin(), sx.end(), sp, ep);
        its.push_back(m_sa.begin() + sp);
        its2.push_back(m_sa.begin() + ep + 1);
    }

    vector<pii> spans1, spans2;
    for (auto it = its[0]; it != its2[0]; ++it) {
        spans1.emplace_back(*it, *it);
    }
    auto pspan1 = &spans1;
    auto pspan2 = &spans2;

    // incremental search
    if (its.size() > 1) {
        // The gap bounds are only read when there is a second subpattern, so
        // a single-subpattern pattern never touches pat.gaps.
        // NOTE(review): the distance bounds are derived from s[0] and
        // pat.gaps[0] only and reused for every later subpattern -- confirm
        // that all subpatterns/gaps are meant to share them.
        const size_type min_gap = s[0].size() + pat.gaps[0].first;
        const size_type max_gap = s[0].size() + pat.gaps[0].second;

        for (size_t i = 1; i < its.size(); ++i) {
            std::sort(pspan1->begin(), pspan1->end());
            pspan2->clear();
            for (auto it = its[i]; it != its2[i]; ++it) {
                const auto pos = *it;

                // An occurrence closer to the text start than the minimum
                // distance cannot extend any span. Without this check the
                // unsigned subtraction below would wrap around.
                if (pos < min_gap) {
                    continue;
                }
                const auto latest_start = pos - min_gap;

                // Lower search key. When pos < max_gap the subtraction wraps
                // around, so the key is clamped to the very first possible
                // span instead of a huge bogus position.
                // NOTE(review): in the non-clamped case the key's second
                // component (`pos`) makes lower_bound skip spans whose start
                // is exactly pos - max_gap; kept as-is -- confirm whether the
                // upper gap bound is meant to be exclusive.
                auto lower_key = make_pair(pos - max_gap, pos);
                if (pos < max_gap) {
                    lower_key.first = 0;
                    lower_key.second = 0;
                }

                for (auto pot_match = lower_bound(pspan1->begin(), pspan1->end(), lower_key);
                     pot_match != pspan1->end() && pot_match->first <= latest_start;
                     ++pot_match) {
                    pspan2->emplace_back(pos, pot_match->second);
                }
            }
            std::swap(pspan1, pspan2);
        }
    }

    // Order candidate matches by the start of their first subpattern.
    std::sort(pspan1->begin(), pspan1->end(), [](const pii &left, const pii &right) {
        return left.second < right.second;
    });

    // Greedy non-overlap filter: report a match, then skip every following
    // candidate that starts before the reported match ends. The iterator is
    // advanced in exactly one place, so it never steps past end() and the
    // first non-overlapping candidate is not skipped.
    auto it = pspan1->begin();
    while (it != pspan1->end()) {
        res.positions.push_back(it->second);
        auto end = it->first + last_size;
        ++it;
        while (it != pspan1->end() && it->second < end) {
            ++it;
        }
    }
    return res;
}
inline void PackedVector::reserve(const size_t& future_size) { if (future_size > vec.size()) { sdsl::int_vector<> tmp; tmp.width(vec.width()); tmp.resize(future_size); for (size_t i = 0; i < filled; i++) { tmp[i] = vec[i]; } vec = std::move(tmp); } }
// Initialises the ranker from cached collection statistics.
//
// Loads the collection length and the per-document lengths from the cache
// (constructing the document lengths first if their cache file is missing),
// then sets `num_docs` and `avg_doc_len` and prints both to stderr.
rank_bm25(cache_config& cconfig) {
    // Zero-initialised so the average below is well defined even if
    // load_from_cache leaves the variable untouched.
    // NOTE(review): this is a local; if the class also declares a
    // `num_terms` member (not visible here) it is shadowed and stays unset
    // -- confirm which is intended.
    uint64_t num_terms = 0;
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);
    std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
}
// Initialises the ranker from cached collection statistics.
//
// Loads `num_terms` and the per-document lengths from the cache
// (constructing the document lengths first if their cache file is missing),
// then sets `num_docs` and `min_doc_len` and prints both to stderr.
rank_tfidf(cache_config& cconfig) {
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    // std::min_element returns end() for an empty range; dereferencing that
    // is undefined, so an empty collection gets a minimum length of 0.
    auto min_itr = std::min_element(doc_lengths.begin(),doc_lengths.end());
    if (min_itr != doc_lengths.end()) {
        min_doc_len = *min_itr;
    } else {
        min_doc_len = 0;
    }
    std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
}
// Initialises the ranker from the files registered in a collection.
//
// Reads the document lengths from the KEY_DOCLEN file, takes `num_docs` from
// their count, sets `num_terms` to the size of the KEY_TEXT file's contents
// minus `num_docs`, derives `avg_doc_len`, and logs the document count and
// average length.
rank_bm25(collection& col)
{
    load_from_file(doc_lengths, col.file_map[KEY_DOCLEN]);
    num_docs = doc_lengths.size();

    {
        // The mapper lives only inside this scope; just its size is needed.
        sdsl::int_vector_mapper<> text(col.file_map[KEY_TEXT]);
        num_terms = text.size() - num_docs;
    }

    avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);

    LOG(INFO) << "num docs : " << num_docs;
    LOG(INFO) << "avg doc len : " << avg_doc_len;
}
// Stores `value` at index `i` (which must be below `filled`).
//
// If `value` does not fit in the current element width, the backing vector
// is first rebuilt with the smallest sufficient width (at most 64 bits) and
// the first `filled` elements are copied over.
inline void PackedVector::set(const size_t& i, const uint64_t& value) {
    assert(i < filled);

    // Find the smallest width, not below the current one, that can hold
    // `value`. The value is shifted right by a count that stays below 64,
    // which avoids the undefined shift-by-64 a left-shifted all-ones mask
    // runs into when the width is already 64 or `value` has its top bit set.
    const uint8_t max_width = std::numeric_limits<uint64_t>::digits;
    uint8_t width = vec.width();
    while (width < max_width && (value >> width) != 0) {
        width++;
    }

    if (width > vec.width()) {
        // Rebuild the backing vector with wider elements, same slot count.
        sdsl::int_vector<> wider_vec;
        wider_vec.width(width);
        wider_vec.resize(vec.size());
        for (size_t j = 0; j < filled; j++) {
            wider_vec[j] = vec[j];
        }
        vec = std::move(wider_vec);
    }

    vec[i] = value;
}