// Searches for occurrences of a gapped pattern and reports non-overlapping
// matches.
//
// Steps:
//   1. Each sub-pattern in `pat.subpatterns` is located with forward_search,
//      giving a half-open iterator range [its[i], its2[i]) into m_sa.
//   2. Partial matches are kept as pairs (first, second) where `first` is the
//      m_sa value of the most recently matched sub-pattern and `second` is
//      the m_sa value of the first sub-pattern of that chain.
//   3. For each following sub-pattern, every occurrence `pos` extends all
//      partial matches whose `first` lies within the allowed distance window.
//   4. Complete matches are sorted by `second`; one is reported and all
//      matches starting before its end (`first + size of last sub-pattern`)
//      are skipped.
//
// @param pat  Pattern providing `subpatterns`, `gaps` and `raw_regexp`.
// @return     Result whose `positions` holds the `second` value of every
//             reported match, in ascending order.
//
// NOTE(review): the distance window is derived only from s[0] and
// pat.gaps[0] and is reused for every pair of consecutive sub-patterns;
// this presumably assumes uniform sub-pattern lengths and gaps -- confirm.
gapped_search_result search(const gapped_pattern& pat) const
{
    gapped_search_result res;
    const vector<string_type>& s = pat.subpatterns;

    std::cerr << "REGEX ::: " << pat.raw_regexp << std::endl;

    // Without sub-patterns there is nothing to search for; this also keeps
    // the s[0] / s[s.size() - 1] accesses below in bounds.
    if (s.empty())
        return res;

    // Allowed distance between the positions of two consecutive sub-patterns.
    // Only needed (and pat.gaps only read) when there is more than one
    // sub-pattern.
    size_type min_gap = 0;
    size_type max_gap = 0;
    if (s.size() > 1) {
        min_gap = s[0].size() + pat.gaps[0].first;
        max_gap = s[0].size() + pat.gaps[0].second;
    }
    const auto last_size = s[s.size() - 1].size();

    // get ranges
    using sa_iterator = sdsl::int_vector_const_iterator<sdsl::int_vector<0>>::const_iterator;
    vector<sa_iterator> its;   // begin of the m_sa range of sub-pattern i
    vector<sa_iterator> its2;  // one past the end of that range
    // Iterating by value keeps the iterator types handed to forward_search
    // exactly as before; the copies are one per sub-pattern.
    for (auto sx : s) {
        size_type sp, ep;
        forward_search(m_text.begin(), m_text.end(), m_sa, 0, m_sa.size()-1,
                       sx.begin(), sx.end(), sp, ep);
        its.push_back(m_sa.begin() + sp);
        its2.push_back(m_sa.begin() + ep + 1);
    }

    // Seed the partial matches with every occurrence of the first sub-pattern.
    vector<pii> spans1, spans2;
    for (auto it = its[0]; it != its2[0]; ++it)
        spans1.emplace_back(*it, *it);

    // Double buffering: pspan1 holds the current partial matches, pspan2
    // receives the extended ones; the pointers are swapped after each round.
    auto pspan1 = &spans1;
    auto pspan2 = &spans2;

    // incremental search
    for (size_t i = 1; i < its.size(); ++i) {
        // Sorted by (first, second) so the window can be found by binary search.
        std::sort(pspan1->begin(), pspan1->end());
        pspan2->clear();
        for (auto it = its[i]; it != its2[i]; ++it) {
            auto pos = *it;

            // An occurrence closer to the text start than min_gap cannot
            // extend any partial match (and pos - min_gap would wrap around).
            if (pos < min_gap)
                continue;

            // If pos - max_gap would wrap around, the window starts at the
            // beginning of the text, so every partial match is a candidate.
            // NOTE(review): in the regular case the probe's second component
            // is `pos`, so partial matches with first == pos - max_gap appear
            // to be skipped (upper gap bound effectively exclusive) -- kept
            // as in the original; confirm whether this is intended.
            auto pot_match = pspan1->begin();
            if (!(pos < max_gap)) {
                pot_match = lower_bound(pspan1->begin(), pspan1->end(),
                                        make_pair(pos - max_gap, pos));
            }
            for (; pot_match != pspan1->end() && pot_match->first <= pos - min_gap;
                 ++pot_match) {
                pspan2->emplace_back(pos, pot_match->second);
            }
        }
        std::swap(pspan1, pspan2);
    }

    // Order complete matches by their start position.
    std::sort(pspan1->begin(), pspan1->end(),
              [](const pii& left, const pii& right) {
                  return left.second < right.second;
              });

    // Report a match, then skip every match that starts before it ends.
    // The iterator is advanced in exactly one place so that the first
    // non-overlapping match is not skipped and end() is never passed.
    auto it = pspan1->begin();
    while (it != pspan1->end()) {
        res.positions.push_back(it->second);
        const auto end = it->first + last_size;
        do {
            ++it;  // always make progress, even for a zero-length match
        } while (it != pspan1->end() && it->second < end);
    }
    return res;
}
// Constructs the ranker from cached collection data.
//
// Loads `num_terms` from the cache entry surf::KEY_COLLEN, builds the document
// length file (surf::KEY_DOC_LENGTHS) if it is not cached yet, loads it into
// `doc_lengths`, and derives `num_docs` and `min_doc_len` from it. Both derived
// values are logged to std::cerr.
//
// @param cconfig  Cache configuration passed to the load/construct helpers.
rank_tfidf(cache_config& cconfig)
{
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    // Build the document lengths on demand so the load below can succeed.
    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    // std::min_element returns end() for an empty range; dereferencing it
    // would be undefined behaviour, so an empty collection yields 0.
    auto min_itr = std::min_element(doc_lengths.begin(), doc_lengths.end());
    if (min_itr != doc_lengths.end()) {
        min_doc_len = *min_itr;
    } else {
        min_doc_len = 0;
    }
    std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
}