Example No. 1
        index_sada(collection& col)
        {
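            // Load-or-construct: if a serialized index for this collection and
            // index type already exists on disk, load it; otherwise build all
            // components from the collection files and store the result.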
            file_name = col.path + "index/" + name + "-" + sdsl::util::class_to_hash(*this) + ".idx";
            if (utils::file_exists(file_name)) {  // load
                LOG(INFO) << "LOAD from file '" << file_name << "'";
                std::ifstream ifs(file_name);
                load(ifs);
                m_doc_rmin_marked = sdsl::bit_vector(m_doc_cnt, 0);
                m_doc_rmax_marked = sdsl::bit_vector(m_doc_cnt, 0);
            } else { // construct
                LOG(INFO) << "CONSTRUCT sada index";

                LOG(INFO) << "CONSTRUCT DOC ISA";
                {
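                    // Split the permuted text at the document separator (symbol 1),
                    // build a suffix array for each document with qsufsort, and
                    // invert it in place to obtain the per-document inverse SA.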
                    m_doc_isa.resize(m_doc_cnt);
                    std::vector<uint64_t> doc_buffer;
                    sdsl::int_vector_mapper<> text(col.file_map[KEY_TEXTPERM]);
                    size_type doc_id = 0;
                    for (size_type i = 0; i < text.size(); ++i) {
                        if (1ULL == text[i]) {
                            if (doc_buffer.size() > 0) {
                                doc_buffer.push_back(0);
                                {
                                    sdsl::int_vector<> sa(doc_buffer.size(), 0, sdsl::bits::hi(doc_buffer.size())+1);
                                    sdsl::qsufsort::construct_sa(sa, doc_buffer);
                                    sdsl::util::bit_compress(sa);
                                    m_doc_isa[doc_id] = sa;
                                    for (size_type j = 0; j < doc_buffer.size(); ++j) {
                                        m_doc_isa[doc_id][sa[j]] = j;
                                    }
                                }
                            }
                            ++doc_id;
                            doc_buffer.clear();
                        } else {
                            doc_buffer.push_back(text[i]);
                        }
                    }
                }

                {
                    const sdsl::int_vector_mapper<0,std::ios_base::in> D(col.file_map[KEY_D]);
                    {
                        LOG(INFO) << "CONSTRUCT CPrev";
                        sdsl::int_vector<> Cprev(D.size(), 0, sdsl::bits::hi(D.size())+1);
                        {
                            sdsl::int_vector<> last_occ(m_doc_cnt+1, 0, sdsl::bits::hi(D.size())+1);
                            for (size_type i = 0; i < D.size(); ++i) {
                                size_type doc = D[i];
                                Cprev[i]      = last_occ[doc];
                                last_occ[doc] = i;
                            }
                        }
                        m_rminq = range_min_type(&Cprev);
                    }
                    {
                        LOG(INFO) << "CONSTRUCT CNext";
                        sdsl::int_vector<> Cnext(D.size(), 0, sdsl::bits::hi(D.size())+1);
                        {
                            sdsl::int_vector<> last_occ(m_doc_cnt+1, D.size(), sdsl::bits::hi(D.size())+1);
                            for (size_type i = 0, j = D.size()-1; i < D.size(); ++i, --j) {
                                size_type doc = D[j];
                                Cnext[j]      = last_occ[doc];
                                last_occ[doc] = j;
                            }
                        }
                        m_rmaxq = range_max_type(&Cnext);
                    }
                }
                m_doc_rmin_marked = sdsl::bit_vector(m_doc_cnt, 0);
                m_doc_rmax_marked = sdsl::bit_vector(m_doc_cnt, 0);


                LOG(INFO) << "CONSTRUCT CSA";
                sdsl::cache_config cfg;
                cfg.delete_files = false;
                cfg.dir = col.path + "/tmp/";
                cfg.id = "TMP";
                cfg.file_map[sdsl::conf::KEY_SA] = col.file_map[KEY_SA];
                cfg.file_map[sdsl::conf::KEY_TEXT_INT] = col.file_map[KEY_TEXTPERM];
                construct(m_csa_full, col.file_map[KEY_TEXTPERM], cfg, 0);

                LOG(INFO) << "CONSTRUCT DOC BORDER RANK";
                sdsl::bit_vector dbv;
                sdsl::load_from_file(dbv,col.file_map[KEY_DBV]);
                m_doc_border = doc_border_type(dbv);
                m_doc_border_rank   = doc_border_rank_type(&m_doc_border);
                m_doc_border_select = doc_border_select_type(&m_doc_border);
                m_doc_cnt = m_doc_border_rank(m_doc_border.size());

                LOG(INFO) << "STORE to file '" << file_name << "'";
                std::ofstream ofs(file_name);
                auto bytes = serialize(ofs);
                std::ofstream vofs(file_name+".html");
                sdsl::write_structure<sdsl::HTML_FORMAT>(vofs,*this);

                LOG(INFO) << "sada index size : " << bytes / (1024*1024) << " MB";
            }
        }
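
The Cprev/Cnext arrays and the range min/max structures built above prepare Sadakane's document-listing idea: within an SA interval, every distinct document has exactly one position whose previous occurrence lies before the interval, and those positions can be enumerated with range-minimum queries on Cprev. The following minimal, self-contained sketch illustrates this on a toy document array; it is not part of the index class, it assumes sdsl-lite is available, sdsl::rmq_succinct_sct stands in for range_min_type, and the helper report_distinct and the data are purely illustrative. Unlike the constructor above, the sketch stores previous positions shifted by one so that 0 unambiguously means "no previous occurrence".

#include <sdsl/int_vector.hpp>
#include <sdsl/rmq_support.hpp>
#include <iostream>
#include <vector>

using rmq_type = sdsl::rmq_succinct_sct<>; // stands in for range_min_type

// Report each distinct document in D[lb..r] exactly once: the position with the
// smallest Cprev value in [l,r] is the leftmost occurrence of its document if
// that previous occurrence lies before lb; then recurse on both sides of it.
void report_distinct(const rmq_type& rmq, const sdsl::int_vector<>& Cprev,
                     const sdsl::int_vector<>& D,
                     size_t l, size_t r, size_t lb, std::vector<uint64_t>& out)
{
    if (l > r) return;
    size_t m = rmq(l, r);        // index of the minimum Cprev in [l,r]
    if (Cprev[m] > lb) return;   // previous occurrence already inside [lb,r]: done
    out.push_back(D[m]);
    if (m > l) report_distinct(rmq, Cprev, D, l, m - 1, lb, out);
    report_distinct(rmq, Cprev, D, m + 1, r, lb, out);
}

int main()
{
    // toy document array (in the index it is derived from the CSA and doc borders)
    std::vector<uint64_t> d = { 0, 2, 1, 0, 2, 2, 1, 0 };
    sdsl::int_vector<> D(d.size(), 0);
    for (size_t i = 0; i < d.size(); ++i) D[i] = d[i];

    // Cprev[i] = 1 + previous position of document D[i], or 0 if there is none
    sdsl::int_vector<> Cprev(D.size(), 0);
    std::vector<uint64_t> last_occ(3, 0); // three documents in the toy array
    for (size_t i = 0; i < D.size(); ++i) {
        Cprev[i] = last_occ[D[i]];
        last_occ[D[i]] = i + 1;
    }
    rmq_type rmq(&Cprev);

    std::vector<uint64_t> docs;
    report_distinct(rmq, Cprev, D, 2, 6, 2, docs); // distinct documents in D[2..6]
    for (auto doc : docs) std::cout << doc << "\n"; // prints 1, 0, 2
}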
Example No. 2
    static void prune(collection& col, bool rebuild, uint64_t target_dict_size_bytes, uint64_t num_threads)
    {
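        // Prune the dictionary down to target_dict_size_bytes by removing long,
        // rarely used segments, unless a pruned dictionary of that size has
        // already been built for this dictionary hash.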
        auto start_total = hrclock::now();
        auto start_dict_size_bytes = 0ULL;
        {
            const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
            start_dict_size_bytes = dict.size();
        }
        if (start_dict_size_bytes == target_dict_size_bytes) {
            LOG(INFO) << "\t"
                      << "No dictionary pruning necessary.";
            return;
        }
        auto dict_hash = col.param_map[PARAM_DICT_HASH];
        auto new_dict_file = file_name(col, target_dict_size_bytes, dict_hash);
        if (!rebuild && utils::file_exists(new_dict_file)) {
            LOG(INFO) << "\t"
                      << "Pruned dictionary exists at '" << new_dict_file << "'";
            col.file_map[KEY_DICT] = new_dict_file;
            col.compute_dict_hash();
            return;
        }

        /* (1) create or load statistics */
        LOG(INFO) << "\t"
                  << "Create or load statistics.";
        auto dict_file = col.file_map[KEY_DICT];
        auto dict_stats_file = dict_file + "-" + KEY_DICT_STATISTICS + "-" + t_factorization_strategy::type() + ".sdsl";
        factorization_statistics fstats;
        if (rebuild || !utils::file_exists(dict_stats_file)) {
            fstats = t_factorization_strategy::template parallel_factorize<factor_tracker>(col, rebuild, num_threads);
            sdsl::store_to_file(fstats, dict_stats_file);
        }
        else {
            sdsl::load_from_file(fstats, dict_stats_file);
        }

        /* (2) find segments */
        LOG(INFO) << "\t"
                  << "Find potential segments to remove.";

        size_t total_len = 0;
        auto bytes_to_remove = start_dict_size_bytes - target_dict_size_bytes;
        auto freq_threshold = t_freq_threshold;
        std::vector<segment_info> segments;
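        // Scan the per-symbol usage statistics and collect runs of at least
        // t_length_threshold symbols whose usage is below the frequency
        // threshold; double the threshold until the runs cover enough bytes.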
        while (total_len < bytes_to_remove) {
            segments.clear();
            total_len = 0;
            size_t run_len = 0;
            size_t total_byte_usage = 0;
            for (size_t i = 0; i < fstats.dict_usage.size(); i++) {
                if (fstats.dict_usage[i] <= freq_threshold) {
                    run_len++;
                    total_byte_usage += fstats.dict_usage[i];
                }
                else {
                    if (run_len >= t_length_threshold) {
                        auto seg_start = i - run_len;
                        segments.emplace_back(seg_start, run_len, total_byte_usage, 0);
                        total_len += run_len;
                    }
                    run_len = 0;
                    total_byte_usage = 0;
                }
            }
            LOG(INFO) << "Freq threshold = " << freq_threshold << " Found bytes = " << total_len << " Req = " << bytes_to_remove;
            freq_threshold *= 2;
        }
        LOG(INFO) << "\t"
                  << "Freq threshold = " << freq_threshold / 2 << " Length threshold = " << t_length_threshold << " Found bytes = " << total_len;
        LOG(INFO) << "\t"
                  << "Found " << segments.size() << " segments of total length " << total_len << " (" << total_len / (1024 * 1024) << " MiB)";

        /* (3) compute the metric for those segments */
        {
            LOG(INFO) << "Create/Load dictionary index";
            t_dict_idx idx(col, rebuild);
            const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
            for (size_t i = 0; i < segments.size(); i++) {
                compute_nfac(idx, dict, segments[i]);
            }
        }
        /* (4) sort by method */
        LOG(INFO) << "Sort segments by weight";
        if (t_method == FF) {
            /* FF */
            std::sort(segments.begin(), segments.end(), [](const segment_info& a, const segment_info& b) {
                double score_a = (double)a.num_factors_req * ((double)a.total_byte_usage / (double)a.length);
                double score_b = (double)b.num_factors_req * ((double)b.total_byte_usage / (double)b.length);
                return score_a < score_b;
            });
        }
        else {
            /* FFT */
            std::sort(segments.begin(), segments.end(), [](const segment_info& a, const segment_info& b) {
                double score_a = (double)a.num_factors_req * ((double)a.total_byte_usage / (double)a.length);
                score_a = score_a / (double)a.length;
                double score_b = (double)b.num_factors_req * ((double)b.total_byte_usage / (double)b.length);
                score_b = score_b / (double)b.length;
                return score_a < score_b;
            });
        }

        size_t segments_to_remove = 0;
        size_t segment_cum_len = 0;
        for (size_t i = 0; i < segments.size(); i++) {
            segments_to_remove++;
            segment_cum_len += segments[i].length;
            if (segment_cum_len >= bytes_to_remove) {
                // update the length so it fits into the size we want
                auto new_len = segment_cum_len - bytes_to_remove;
                segments[i].length -= (new_len + 1);
                break;
            }
        }
        segments.resize(segments_to_remove);
        LOG(INFO) << "Selected " << segments_to_remove << " for removal";

        LOG(INFO) << "Creating pruned dictionary";
        LOG(INFO) << "Sorting segments into offset order";
        std::sort(segments.begin(), segments.end(), [](const segment_info& a, const segment_info& b) {
            return a.offset < b.offset;
        });
        {
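            // Copy the dictionary, skipping the selected segments (now in offset
            // order), and terminate the pruned dictionary with a 0 byte.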
            const sdsl::int_vector_mapper<8, std::ios_base::in> dict(col.file_map[KEY_DICT]);
            auto wdict = sdsl::write_out_buffer<8>::create(new_dict_file);
            size_t cur_segment = 0;
            for (size_t i = 0; i < dict.size() - 2;) {
                if (cur_segment < segments.size() && segments[cur_segment].offset == i) {
                    /* skip the segment */
                    i += segments[cur_segment].length;
                    cur_segment++;
                }
                else {
                    wdict.push_back(dict[i]);
                    i++;
                }
            }
            wdict.push_back(0);
            LOG(INFO) << "\t"
                      << "Pruned dictionary size = " << wdict.size() / (1024 * 1024) << " MiB";
        }

        col.file_map[KEY_DICT] = new_dict_file;
        auto end_total = hrclock::now();
        LOG(INFO) << "\n"
                  << "\t" << type() + " Total time = " << duration_cast<milliseconds>(end_total - start_total).count() / 1000.0f << " sec";
        col.compute_dict_hash();
    }
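
The FF and FFT weights from the sort step can be reproduced in isolation. The sketch below uses a hypothetical segment struct with the same four fields as the segment_info objects above (names are illustrative, not the prune() internals) and sorts two toy segments the way prune() does, lowest score first: FF multiplies num_factors_req by the average byte usage per symbol, while FFT divides that product by the segment length once more, so longer segments with the same FF weight are removed earlier.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// hypothetical stand-in for segment_info: offset, length, summed usage, factors needed
struct segment {
    uint64_t offset;
    uint64_t length;
    uint64_t total_byte_usage;
    uint64_t num_factors_req;
};

// FF: factors required, weighted by average usage per dictionary symbol
double ff_score(const segment& s)
{
    return (double)s.num_factors_req * ((double)s.total_byte_usage / (double)s.length);
}

// FFT: FF normalised by segment length
double fft_score(const segment& s)
{
    return ff_score(s) / (double)s.length;
}

int main()
{
    std::vector<segment> segs = {
        { 0, 64, 128, 10 },     // short segment, higher usage per symbol
        { 500, 4096, 256, 12 }, // long segment, low usage per symbol
    };
    // ascending sort: the cheapest segments end up first and are removed first
    std::sort(segs.begin(), segs.end(), [](const segment& a, const segment& b) {
        return fft_score(a) < fft_score(b);
    });
    for (const auto& s : segs) {
        std::cout << "offset=" << s.offset << " FF=" << ff_score(s)
                  << " FFT=" << fft_score(s) << "\n";
    }
}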