// Count how many factors the restricted factorizer emits for a single
// dictionary segment and record the result in the segment itself.
static void compute_nfac(t_dict_idx& idx,
                         const sdsl::int_vector_mapper<8, std::ios_base::in>& dict,
                         segment_info& segment)
{
    auto seg_start = dict.begin() + segment.offset;
    auto seg_end = seg_start + segment.length;
    auto factor_itr = idx.factorize_restricted(seg_start, seg_end);
    uint64_t factors_produced = 0;
    while (!factor_itr.finished()) {
        factors_produced++;
        ++factor_itr;
    }
    segment.num_factors_req = factors_produced;
}
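/* A hedged sketch of the segment_info record that compute_nfac() above and
   prune() below operate on, inferred purely from the fields accessed and the
   four-argument emplace_back() in this file; the actual definition in the
   codebase may differ. */
struct segment_info_sketch {
    uint64_t offset;           // start of the low-usage run within the dictionary
    uint64_t length;           // length of the run in bytes
    uint64_t total_byte_usage; // summed usage counts over the bytes of the run
    uint64_t num_factors_req;  // factors required to re-encode the run, set by compute_nfac()
    segment_info_sketch(uint64_t o, uint64_t l, uint64_t u, uint64_t n)
        : offset(o), length(l), total_byte_usage(u), num_factors_req(n) {}
};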
index_sada(collection& col)
{
    file_name = col.path + "index/" + name + "-" + sdsl::util::class_to_hash(*this) + ".idx";
    if (utils::file_exists(file_name)) { // load
        LOG(INFO) << "LOAD from file '" << file_name << "'";
        std::ifstream ifs(file_name);
        load(ifs);
        m_doc_rmin_marked = sdsl::bit_vector(m_doc_cnt, 0);
        m_doc_rmax_marked = sdsl::bit_vector(m_doc_cnt, 0);
    } else { // construct
        LOG(INFO) << "CONSTRUCT sada index";

        /* the doc border structures are built first so that m_doc_cnt is
           known before it is used to size the per-document ISAs and the
           C arrays below */
        LOG(INFO) << "CONSTRUCT DOC BORDER RANK";
        sdsl::bit_vector dbv;
        sdsl::load_from_file(dbv, col.file_map[KEY_DBV]);
        m_doc_border = doc_border_type(dbv);
        m_doc_border_rank = doc_border_rank_type(&m_doc_border);
        m_doc_border_select = doc_border_select_type(&m_doc_border);
        m_doc_cnt = m_doc_border_rank(m_doc_border.size());

        LOG(INFO) << "CONSTRUCT DOC ISA";
        {
            m_doc_isa.resize(m_doc_cnt);
            std::vector<uint64_t> doc_buffer;
            sdsl::int_vector_mapper<> text(col.file_map[KEY_TEXTPERM]);
            size_type doc_id = 0;
            for (size_type i = 0; i < text.size(); ++i) {
                if (1ULL == text[i]) { // document separator
                    if (doc_buffer.size() > 0) {
                        doc_buffer.push_back(0); // 0-terminate for suffix sorting
                        {
                            sdsl::int_vector<> sa(doc_buffer.size(), 0,
                                                  sdsl::bits::hi(doc_buffer.size()) + 1);
                            sdsl::qsufsort::construct_sa(sa, doc_buffer);
                            sdsl::util::bit_compress(sa);
                            m_doc_isa[doc_id] = sa;
                            // invert the suffix array: ISA[SA[j]] = j
                            for (size_type j = 0; j < doc_buffer.size(); ++j) {
                                m_doc_isa[doc_id][sa[j]] = j;
                            }
                        }
                    }
                    ++doc_id;
                    doc_buffer.clear();
                } else {
                    doc_buffer.push_back(text[i]);
                }
            }
        }
        {
            const sdsl::int_vector_mapper<0, std::ios_base::in> D(col.file_map[KEY_D]);
            {
                LOG(INFO) << "CONSTRUCT CPrev";
                // Cprev[i] = previous position with the same document id as D[i], 0 if none
                sdsl::int_vector<> Cprev(D.size(), 0, sdsl::bits::hi(D.size()) + 1);
                {
                    sdsl::int_vector<> last_occ(m_doc_cnt + 1, 0, sdsl::bits::hi(D.size()) + 1);
                    for (size_type i = 0; i < D.size(); ++i) {
                        size_type doc = D[i];
                        Cprev[i] = last_occ[doc];
                        last_occ[doc] = i;
                    }
                }
                m_rminq = range_min_type(&Cprev);
            }
            {
                LOG(INFO) << "CONSTRUCT CNext";
                // Cnext[j] = next position with the same document id as D[j], D.size() if none
                sdsl::int_vector<> Cnext(D.size(), 0, sdsl::bits::hi(D.size()) + 1);
                {
                    sdsl::int_vector<> last_occ(m_doc_cnt + 1, D.size(), sdsl::bits::hi(D.size()) + 1);
                    for (size_type i = 0, j = D.size() - 1; i < D.size(); ++i, --j) {
                        size_type doc = D[j];
                        Cnext[j] = last_occ[doc];
                        last_occ[doc] = j;
                    }
                }
                m_rmaxq = range_max_type(&Cnext);
            }
        }
        m_doc_rmin_marked = sdsl::bit_vector(m_doc_cnt, 0);
        m_doc_rmax_marked = sdsl::bit_vector(m_doc_cnt, 0);

        LOG(INFO) << "CONSTRUCT CSA";
        sdsl::cache_config cfg;
        cfg.delete_files = false;
        cfg.dir = col.path + "/tmp/";
        cfg.id = "TMP";
        cfg.file_map[sdsl::conf::KEY_SA] = col.file_map[KEY_SA];
        cfg.file_map[sdsl::conf::KEY_TEXT_INT] = col.file_map[KEY_TEXTPERM];
        construct(m_csa_full, col.file_map[KEY_TEXTPERM], cfg, 0);

        LOG(INFO) << "STORE to file '" << file_name << "'";
        std::ofstream ofs(file_name);
        auto bytes = serialize(ofs);
        std::ofstream vofs(file_name + ".html");
        sdsl::write_structure<sdsl::HTML_FORMAT>(vofs, *this);
        LOG(INFO) << "sada index size : " << bytes / (1024 * 1024) << " MiB";
    }
}
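/* Hedged sketch of the SA -> ISA inversion performed per document above, as a
   standalone helper. `toy_isa` is a hypothetical name for illustration only;
   it mirrors the same sdsl calls the constructor uses and assumes, like the
   constructor, a 0-terminated buffer with all other symbols > 0 (requires
   <sdsl/int_vector.hpp> and <sdsl/qsufsort.hpp>). */
inline sdsl::int_vector<> toy_isa(std::vector<uint64_t>& doc_buffer)
{
    sdsl::int_vector<> sa(doc_buffer.size(), 0, sdsl::bits::hi(doc_buffer.size()) + 1);
    sdsl::qsufsort::construct_sa(sa, doc_buffer); // same overload as used above
    // the suffix starting at text position sa[j] has lexicographic rank j,
    // so the inverse permutation is isa[sa[j]] = j
    sdsl::int_vector<> isa(doc_buffer.size(), 0, sa.width());
    for (uint64_t j = 0; j < doc_buffer.size(); ++j) {
        isa[sa[j]] = j;
    }
    return isa;
}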
static void prune(collection& col, bool rebuild, uint64_t target_dict_size_bytes, uint64_t num_threads)
{
    auto start_total = hrclock::now();
    auto start_dict_size_bytes = 0ULL;
    {
        const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
        start_dict_size_bytes = dict.size();
    }
    // nothing to do if the dictionary is already at (or below) the target;
    // <= also guards the unsigned subtraction below against underflow
    if (start_dict_size_bytes <= target_dict_size_bytes) {
        LOG(INFO) << "\t" << "No dictionary pruning necessary.";
        return;
    }
    auto dict_hash = col.param_map[PARAM_DICT_HASH];
    auto new_dict_file = file_name(col, target_dict_size_bytes, dict_hash);
    if (!rebuild && utils::file_exists(new_dict_file)) {
        LOG(INFO) << "\t" << "Pruned dictionary exists at '" << new_dict_file << "'";
        col.file_map[KEY_DICT] = new_dict_file;
        col.compute_dict_hash();
        return;
    }

    /* (1) create or load statistics */
    LOG(INFO) << "\t" << "Create or load statistics.";
    auto dict_file = col.file_map[KEY_DICT];
    auto dict_stats_file = dict_file + "-" + KEY_DICT_STATISTICS + "-"
                           + t_factorization_strategy::type() + ".sdsl";
    factorization_statistics fstats;
    if (rebuild || !utils::file_exists(dict_stats_file)) {
        fstats = t_factorization_strategy::template parallel_factorize<factor_tracker>(col, rebuild, num_threads);
        sdsl::store_to_file(fstats, dict_stats_file);
    } else {
        sdsl::load_from_file(fstats, dict_stats_file);
    }

    /* (2) find candidate segments: maximal runs of rarely used dictionary
           bytes, doubling the frequency threshold until enough bytes are found */
    LOG(INFO) << "\t" << "Find potential segments to remove.";
    size_t total_len = 0;
    auto bytes_to_remove = start_dict_size_bytes - target_dict_size_bytes;
    auto freq_threshold = t_freq_threshold;
    std::vector<segment_info> segments;
    while (total_len < bytes_to_remove) {
        segments.clear();
        total_len = 0;
        size_t run_len = 0;
        size_t total_byte_usage = 0;
        for (size_t i = 0; i < fstats.dict_usage.size(); i++) {
            if (fstats.dict_usage[i] <= freq_threshold) {
                run_len++;
                total_byte_usage += fstats.dict_usage[i];
            } else {
                if (run_len >= t_length_threshold) {
                    auto seg_start = i - run_len;
                    segments.emplace_back(seg_start, run_len, total_byte_usage, 0);
                    total_len += run_len;
                }
                run_len = 0;
                total_byte_usage = 0;
            }
        }
        LOG(INFO) << "Freq threshold = " << freq_threshold
                  << " Found bytes = " << total_len
                  << " Req = " << bytes_to_remove;
        freq_threshold *= 2;
    }
    LOG(INFO) << "\t" << "Freq threshold = " << freq_threshold / 2
              << " Length threshold = " << t_length_threshold
              << " Found bytes = " << total_len;
    LOG(INFO) << "\t" << "Found " << segments.size() << " segments of total length "
              << total_len << " (" << total_len / (1024 * 1024) << " MiB)";

    /* (3) compute the metric for those segments */
    {
        LOG(INFO) << "Create/Load dictionary index";
        t_dict_idx idx(col, rebuild);
        const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
        for (size_t i = 0; i < segments.size(); i++) {
            compute_nfac(idx, dict, segments[i]);
        }
    }

    /* (4) sort by method */
    LOG(INFO) << "Sort segments by weight";
    if (t_method == FF) { /* FF: factors required, weighted by average byte usage */
        std::sort(segments.begin(), segments.end(),
                  [](const segment_info& a, const segment_info& b) {
                      double score_a = (double)a.num_factors_req * ((double)a.total_byte_usage / (double)a.length);
                      double score_b = (double)b.num_factors_req * ((double)b.total_byte_usage / (double)b.length);
                      return score_a < score_b;
                  });
    } else { /* FFT: the FF score normalized by segment length */
        std::sort(segments.begin(), segments.end(),
                  [](const segment_info& a, const segment_info& b) {
                      double score_a = (double)a.num_factors_req * ((double)a.total_byte_usage / (double)a.length);
                      score_a = score_a / (double)a.length;
                      double score_b = (double)b.num_factors_req * ((double)b.total_byte_usage / (double)b.length);
                      score_b = score_b / (double)b.length;
                      return score_a < score_b;
                  });
    }

    /* (5) greedily take the lowest-scoring segments until enough bytes are covered */
    size_t segments_to_remove = 0;
    size_t segment_cum_len = 0;
    for (size_t i = 0; i < segments.size(); i++) {
        segments_to_remove++;
        segment_cum_len += segments[i].length;
        if (segment_cum_len >= bytes_to_remove) {
            // trim the last segment so the removed bytes match the target size
            auto excess = segment_cum_len - bytes_to_remove;
            segments[i].length -= (excess + 1);
            break;
        }
    }
    segments.resize(segments_to_remove);
    LOG(INFO) << "Selected " << segments_to_remove << " for removal";

    /* (6) write the pruned dictionary */
    LOG(INFO) << "Creating pruned dictionary";
    LOG(INFO) << "Sorting segments into offset order";
    std::sort(segments.begin(), segments.end(),
              [](const segment_info& a, const segment_info& b) {
                  return a.offset < b.offset;
              });
    {
        const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
        auto wdict = sdsl::write_out_buffer<8>::create(new_dict_file);
        size_t cur_segment = 0;
        for (size_t i = 0; i < dict.size() - 2;) {
            // bounds check so we never index past the last selected segment
            if (cur_segment < segments.size() && segments[cur_segment].offset == i) {
                /* skip the segment */
                i += segments[cur_segment].length;
                cur_segment++;
            } else {
                wdict.push_back(dict[i]);
                i++;
            }
        }
        wdict.push_back(0); // keep the terminating zero
        LOG(INFO) << "\t" << "Pruned dictionary size = "
                  << wdict.size() / (1024 * 1024) << " MiB";
    }
    col.file_map[KEY_DICT] = new_dict_file;
    auto end_total = hrclock::now();
    LOG(INFO) << "\n" << "\t" << type() + " Total time = "
              << duration_cast<milliseconds>(end_total - start_total).count() / 1000.0f
              << " sec";
    col.compute_dict_hash();
}
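/* Hedged sketch of the two segment weights the sorts in step (4) implement;
   ff_score_sketch/fft_score_sketch are hypothetical names for illustration
   and do not appear in the codebase. FF weights the number of factors needed
   to re-encode a segment by its average per-byte usage, so cheap-to-replace,
   rarely used runs sort first; FFT divides that score by the segment length
   once more, further favoring long runs. */
inline double ff_score_sketch(const segment_info& s)
{
    return (double)s.num_factors_req * ((double)s.total_byte_usage / (double)s.length);
}
inline double fft_score_sketch(const segment_info& s)
{
    return ff_score_sketch(s) / (double)s.length;
}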