Пример #1
0
csa_wt<t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_wt(cache_config& config)
{
    // Builds the index from files located through `config`. Without a cached BWT
    // nothing is constructed. Otherwise the alphabet and the wavelet tree are each
    // built from their own buffer over the BWT file, followed by SA and ISA samples.
    const char* bwt_key = key_trait<alphabet_type::int_width>::KEY_BWT;
    if (!cache_file_exists(bwt_key, config)) {
        return;
    }
    {
        auto event = memory_monitor::event("construct csa-alpbabet");
        int_vector_buffer<alphabet_type::int_width> bwt_stream(cache_file_name(bwt_key, config));
        size_type bwt_len = bwt_stream.size();
        alphabet_type fresh_alphabet(bwt_stream, bwt_len);
        m_alphabet.swap(fresh_alphabet);
    }
    {
        auto event = memory_monitor::event("construct wavelet tree");
        // A second, independent buffer over the same BWT file.
        int_vector_buffer<alphabet_type::int_width> bwt_stream(cache_file_name(bwt_key, config));
        size_type bwt_len = bwt_stream.size();
        wavelet_tree_type fresh_wt(bwt_stream, bwt_len);
        m_wavelet_tree.swap(fresh_wt);
    }
    {
        auto event = memory_monitor::event("sample SA");
        sa_sample_type fresh_sa_sample(config);
        m_sa_sample.swap(fresh_sa_sample);
    }
    {
        auto event = memory_monitor::event("sample ISA");
        // The ISA samples are constructed with a pointer to the SA samples; after
        // the swap both objects are pointed at m_sa_sample.
        isa_sample_type fresh_isa_sample(config, &m_sa_sample);
        util::swap_support(m_isa_sample, fresh_isa_sample, &m_sa_sample, &m_sa_sample);
    }
}
Пример #2
0
csa_sada<t_enc_vec, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_sada(cache_config& config)
{
    // Builds the CSA from files located through `config`:
    //   1. alphabet from the cached BWT,
    //   2. PSI computed from the BWT and written to the cache,
    //   3. PSI re-read through a buffer and encoded into m_psi,
    //   4. SA samples and ISA samples.
    // Returns early (leaving the remaining members untouched) when no BWT is
    // cached or when PSI cannot be stored.
    create_buffer();
    if (!cache_file_exists(key_trait<alphabet_type::int_width>::KEY_BWT, config)) {
        return;
    }
    int_vector_buffer<alphabet_type::int_width> bwt_buf(cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT,config));
    size_type n = bwt_buf.size();
    {
        auto event = memory_monitor::event("construct csa-alpbabet");
        alphabet_type tmp_alphabet(bwt_buf, n);
        m_alphabet.swap(tmp_alphabet);
    }

    // Per-symbol write cursors, initialised with C[i]; each cursor is advanced
    // every time its symbol is seen in the BWT below.
    int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1);
    for (typename alphabet_type::sigma_type i=0; i < sigma; ++i) {
        cnt_chr[i] = C[i];
    }
    // calculate psi
    {
        auto event = memory_monitor::event("construct PSI");
        // TODO: move PSI construct into construct_PSI.hpp
        int_vector<> psi(n, 0, bits::hi(n)+1);
        for (size_type i=0; i < n; ++i) {
            // Position i goes to the next free slot of the symbol found at bwt_buf[i].
            psi[ cnt_chr[ char2comp[bwt_buf[i]] ]++ ] = i;
        }
        if (!store_to_cache(psi, conf::KEY_PSI, config)) {
            return;
        }
    }
    {
        auto event = memory_monitor::event("encode PSI");
        int_vector_buffer<> psi_buf(cache_file_name(conf::KEY_PSI, config));
        t_enc_vec tmp_psi(psi_buf);
        m_psi.swap(tmp_psi);
    }
    {
        auto event = memory_monitor::event("sample SA");
        sa_sample_type tmp_sa_sample(config);
        m_sa_sample.swap(tmp_sa_sample);
    }
    {
        auto event = memory_monitor::event("sample ISA");
        isa_sample_type isa_s(config, &m_sa_sample);
        // The temporary is about to be destroyed, so it is handed a null pointer.
        util::swap_support(m_isa_sample, isa_s, &m_sa_sample, static_cast<const sa_sample_type*>(nullptr));
    }
}
Пример #3
0
        //! Constructor
        /*! Reads the LCP array from the cache file for conf::KEY_LCP in two passes.
         *  Values below 255 are kept in m_small_lcp; every other entry is marked
         *  with 255 there and its value and position are stored in m_big_lcp and
         *  m_big_lcp_idx.
         *  \param config Cache configuration used to locate the LCP file.
         */
        lcp_byte(cache_config& config) {
            int_vector_buffer<> lcp_stream(cache_file_name(conf::KEY_LCP, config));
            const size_type n = lcp_stream.size();
            m_small_lcp = int_vector<8>(n);

            // Pass 1: fill the byte array and gather what is needed to size the
            // two overflow arrays.
            size_type largest = 0;
            size_type last_big_pos = 0;
            size_type big_cnt = 0;
            for (size_type pos = 0; pos < n; ++pos) {
                const size_type value = lcp_stream[pos];
                if (value >= 255) {
                    m_small_lcp[pos] = 255;
                    if (value > largest) {
                        largest = value;
                    }
                    last_big_pos = pos;
                    ++big_cnt;
                } else {
                    m_small_lcp[pos] = value;
                }
            }
            m_big_lcp     = int_vector<>(big_cnt, 0, bits::hi(largest)+1);
            m_big_lcp_idx = int_vector<>(big_cnt, 0, bits::hi(last_big_pos)+1);

            // Pass 2: copy the large values and their positions in text order.
            size_type slot = 0;
            for (size_type pos = 0; pos < n; ++pos) {
                const size_type value = lcp_stream[pos];
                if (value < 255) {
                    continue;
                }
                m_big_lcp[slot] = value;
                m_big_lcp_idx[slot] = pos;
                ++slot;
            }
        }
Пример #4
0
void register_cache_file(const char* key, cache_config &config){
	std::string file_name = cache_file_name(key, config);
	std::ifstream in(file_name.c_str());
	if ( in ){ // if file exists, register it.
		config.file_map[std::string(key)] = file_name;
	}
}
Пример #5
0
void construct(matching_index<t_wt, t_bv>& idx, const std::string& file, sdsl::cache_config& config, uint8_t num_bytes)
{
    // Builds a matching_index for the input in `file`:
    //   1. load the text,
    //   2. construct a CSA over the same file (this populates the cache),
    //   3. construct a t_wt from the cached suffix array file,
    //   4. delete every file listed in config.file_map,
    //   5. assemble the index from the text and the wavelet tree.
    // idx        : receives the constructed index
    // file       : path of the input
    // config     : cache configuration shared by all construction steps
    // num_bytes  : forwarded to load_vector_from_file and to the CSA construction
    sdsl::int_vector<0> text;
    {
        //auto event = memory_monitor::event("text");
        load_vector_from_file(text, file, num_bytes);
    }
    sdsl::csa_wt<sdsl::wt_int<>> csa;
    {
        //auto event = memory_monitor::event("csa");
        construct(csa, file, config, num_bytes);
    }
    t_wt wts;
    {
        //auto event = memory_monitor::event("wt");
        construct(wts, cache_file_name(sdsl::conf::KEY_SA, config));
    }

    // The cached intermediates are no longer needed once the wavelet tree exists.
    sdsl::util::delete_all_files(config.file_map);

    {
        //auto event = memory_monitor::event("compose"); // contains rank support initialization
        // The right-hand side is already a temporary, so no std::move is needed.
        idx = matching_index<t_wt, t_bv>(text, wts);
    }
}
Пример #6
0
 //! Constructor taking a cache_config
 /*! Copies the LCP array from the cache file for conf::KEY_LCP into m_lcp,
  *  element by element, using the width reported by the file buffer.
  *  \param config Cache configuration used to locate the LCP file.
  */
 lcp_bitcompressed(cache_config& config) {
     int_vector_buffer<> lcp_stream(cache_file_name(conf::KEY_LCP, config));
     m_lcp = int_vector<t_width>(lcp_stream.size(), 0, lcp_stream.width());
     size_type pos = 0;
     while (pos < m_lcp.size()) {
         m_lcp[pos] = lcp_stream[pos];
         ++pos;
     }
 }
Пример #7
0
void construct_lcp_PHI(cache_config& config)
{
    // Computes the LCP array from the cached suffix array and text and stores it
    // in the cache under conf::KEY_LCP. One array (`phi`) is reused: it first holds,
    // for every text position, the suffix that precedes it in the suffix array,
    // and is then overwritten in place with the LCP values in text order.
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    using size_type = int_vector<>::size_type;
    using text_type = int_vector<t_width>;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_buffer<> sa_stream(config.file_map[conf::KEY_SA]);
    size_type n = sa_stream.size();

    assert(n > 0);
    if (1 == n) {  // Only the sentinel character: the LCP array is a single 0.
        int_vector<> single_lcp(1, 0);
        store_to_cache(single_lcp, conf::KEY_LCP, config);
        return;
    }

    // Step 1: for each suffix remember the suffix ranked directly before it
    //         (0 is used for the first entry of the suffix array).
    int_vector<> phi(n, 0, sa_stream.width());
    size_type prev_suffix = 0;
    for (size_type rank = 0; rank < n; ++rank) {
        size_type suffix = sa_stream[rank];
        phi[suffix] = prev_suffix;
        prev_suffix = suffix;
    }

    // Step 2: load the text from the cache.
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // Step 3: replace each entry by the length of the common prefix of the two
    //         suffixes. The match length carries over to the next position,
    //         reduced by one, instead of restarting from zero.
    size_type longest = 0;
    size_type match = 0;
    for (size_type pos = 0; pos < n-1; ++pos) {
        size_type other = phi[pos];
        while (text[pos+match] == text[other+match]) {
            ++match;
        }
        phi[pos] = match;
        if (match) {
            if (longest < match) {
                longest = match;
            }
            --match;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(longest)+1;

    // Step 4: write the values in suffix array order; entry 0 is always 0.
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector_buffer<> lcp_out(cache_file_name(conf::KEY_LCP, config), std::ios::out, buffer_size, lcp_width);
    lcp_out[0] = 0;
    sa_stream.buffersize(buffer_size);
    for (size_type rank = 1; rank < n; ++rank) {
        size_type suffix = sa_stream[rank];
        lcp_out[rank] = phi[suffix];
    }
    lcp_out.close();
    register_cache_file(conf::KEY_LCP, config);
}
Пример #8
0
 //! Construct
 /*! Encodes the LCP array found in the cache into m_vec.
  *  \param config    Cache configuration used to locate the LCP file.
  *  \param other_key Cache key to read from; when empty, conf::KEY_LCP is used.
  */
 lcp_vlc(cache_config& config, std::string other_key="")
 {
     std::string lcp_key = other_key.empty() ? std::string(conf::KEY_LCP) : other_key;
     int_vector_buffer<> lcp_stream(cache_file_name(lcp_key, config));
     vlc_vec_type encoded(lcp_stream);
     m_vec.swap(encoded);
 }
Пример #9
0
    //! Constructor
    /*! Builds the alphabet from the cached text, the SA from the cache, and the
     *  ISA from the cached SA. The ISA is then written to the cache file for
     *  conf::KEY_ISA and registered in `config`.
     *  \param config Cache configuration used to locate inputs and store the ISA.
     *  \throws std::ios_base::failure if the ISA cannot be written.
     */
    csa_bitcompressed(cache_config& config) {
        int_vector_buffer<alphabet_type::int_width> text_stream(cache_file_name(key_trait<alphabet_type::int_width>::KEY_TEXT,config));
        int_vector_buffer<>  sa_stream(cache_file_name(conf::KEY_SA,config));
        size_type text_len = text_stream.size();
        {
            alphabet_type fresh_alphabet(text_stream, text_len);
            m_alphabet.swap(fresh_alphabet);
        }
        {
            sa_sample_type fresh_sa(config);
            m_sa.swap(fresh_sa);
        }
        set_isa_samples<csa_bitcompressed>(sa_stream, m_isa);

        const bool isa_stored = store_to_file(m_isa, cache_file_name(conf::KEY_ISA,config), true);
        if (!isa_stored) {
            throw std::ios_base::failure("#csa_bitcompressed: Cannot store ISA to file system!");
        }
        register_cache_file(conf::KEY_ISA, config);
    }
Пример #10
0
        doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
            construct(m_csa_full, file_name, cconfig, num_bytes);

            const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
            std::string text_file = cache_file_name(KEY_TEXT, cconfig);

            bit_vector doc_border;
            construct_doc_border(text_file,doc_border);
            bit_vector::rank_1_type doc_border_rank(&doc_border);
            m_doc_cnt = doc_border_rank(doc_border.size());

            int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
            {
                int_vector<> D;
                construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
                std::string d_file = cache_file_name("DARRAY", cconfig);
                store_to_file(D, d_file);
                util::clear(D);
                construct(m_wtd, d_file);
                sdsl::remove(d_file);
            }
        }
Пример #11
0
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, wt_tag)
{
    // Builds `idx` from the sequence in `file`: the sequence is loaded, written
    // to a scratch cache file, streamed back through a buffer into a temporary
    // t_index that is swapped into `idx`, and the scratch file is removed.
    auto event = memory_monitor::event("construct wavelet tree");
    int_vector<t_index::alphabet_category::WIDTH> sequence;
    load_vector_from_file(sequence, file, num_bytes);
    // The scratch key is "<pid>_<id>", taken from util::pid() and util::id().
    std::string scratch_file = cache_file_name(util::to_string(util::pid())+"_"+util::to_string(util::id()), config);
    store_to_file(sequence, scratch_file);
    util::clear(sequence); // the in-memory copy is not needed while building
    {
        int_vector_buffer<t_index::alphabet_category::WIDTH> stream(scratch_file);
        t_index built(stream, stream.size());
        idx.swap(built);
    }
    sdsl::remove(scratch_file);
}
Пример #12
0
// Prints the maximum, the sum and the average of the LCP array stored in the
// cache file for conf::KEY_LCP to std::cout.
// For an empty array the average is reported as 0 instead of dividing by zero.
void lcp_info(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    int_vector_buffer<> lcp_buf(cache_file_name(conf::KEY_LCP, config));
    const size_type n = lcp_buf.size();

    size_type max_lcp = 0;
    size_type sum_lcp = 0;
    for (size_type i=0; i < n; ++i) {
        // Read each entry from the file buffer once and reuse the value.
        const size_type lcp_i = lcp_buf[i];
        if (lcp_i > max_lcp)
            max_lcp = lcp_i;
        sum_lcp += lcp_i;
    }
    const double avg_lcp = (n > 0) ? sum_lcp/static_cast<double>(n) : 0.0;
    std::cout<<"# max lcp = " << max_lcp << std::endl;
    std::cout<<"# sum lcp = " << sum_lcp << std::endl;
    std::cout<<"# avg lcp = " << avg_lcp << std::endl;
}
Пример #13
0
 //! Constructor
 /*! Splits the cached LCP array into a byte sequence (values below 255, with
  *  255 marking the larger ones) from which m_small_lcp is built via a temporary
  *  file, and m_big_lcp, which stores the values of 255 and above in order.
  *  \param config    Cache configuration used to locate the LCP file.
  *  \param other_key Cache key to read from; when empty, conf::KEY_LCP is used.
  */
 lcp_wt(cache_config& config, std::string other_key="") {
     std::string scratch_file = tmp_file(config, "_lcp_sml");
     std::string lcp_key = other_key.empty() ? std::string(conf::KEY_LCP) : other_key;
     int_vector_buffer<> lcp_stream(cache_file_name(lcp_key, config));
     size_type n = lcp_stream.size();
     size_type largest = 0;
     size_type big_cnt = 0;
     {
         // Byte-sized copy of the array; 255 stands for "stored in m_big_lcp".
         int_vector<8> bytes = int_vector<8>(n);
         for (size_type pos = 0; pos < n; ++pos) {
             size_type value = lcp_stream[pos];
             if (value >= 255) {
                 bytes[pos] = 255;
                 if (largest < value) {
                     largest = value;
                 }
                 ++big_cnt;
             } else {
                 bytes[pos] = value;
             }
         }
         store_to_file(bytes, scratch_file);
     }
     {
         int_vector_buffer<8> byte_stream(scratch_file);
         small_lcp_type built(byte_stream, byte_stream.size());
         m_small_lcp.swap(built);
     }
     sdsl::remove(scratch_file);
     m_big_lcp = int_vector<>(big_cnt, 0, bits::hi(largest)+1);
     {
         size_type slot = 0;
         for (size_type pos = 0; pos < n; ++pos) {
             size_type value = lcp_stream[pos];
             if (value >= 255) {
                 m_big_lcp[slot] = value;
                 ++slot;
             }
         }
     }
 }
Пример #14
0
lcp_dac<t_b, t_rank>::lcp_dac(cache_config& config)
{
// Builds the structure from the LCP array in the cache file for conf::KEY_LCP.
// Each value is cut into blocks of t_b bits. Block k of a value is stored on
// level k of m_data, and m_overflow marks the blocks that are followed by a
// block on the next level. m_level_pointer_and_rank uses two slots per level:
// the even slot holds the level's start offset in m_data, the odd slot the
// rank of m_overflow at that offset (filled in step 4).
//
//  (1) Count for each level, how many blocks are needed for the representation
//      Running time: \f$ O(n \times \frac{\log n}{b}) \f$
//      Result is stored in m_level_pointer_and_rank
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    int_vector_buffer<> lcp_buf(lcp_file);
    size_type n = lcp_buf.size(), val=0;
    // Nothing to build for an empty LCP array.
    if (n == 0)
        return;
// initialize counter
    // Two slots per level; the number of levels is derived from the bit length of n.
    auto _size =  std::max(4*bits::hi(2), 2*(((bits::hi(n)+1)+t_b-1) / t_b));
    m_level_pointer_and_rank.resize(_size);
    for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i)
        m_level_pointer_and_rank[i] = 0;
    m_level_pointer_and_rank[0] = n; // level 0 has n entries

    // level_x_2 is the index of the even slot of the current level (2 * level).
    uint8_t level_x_2 = 0;
    for (size_type i=0; i < n; ++i) {
        val=lcp_buf[i];
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // increase counter for current level by 1
            ++m_level_pointer_and_rank[level_x_2];
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }

//  (2)    Determine maximum level and prefix sums of level counters
    m_max_level = 0;
    size_type sum_blocks = 0, last_block_size=0;
    for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) {
        // Replace the counter of this level by the number of blocks before it.
        t = sum_blocks;
        sum_blocks += m_level_pointer_and_rank[i];
        m_level_pointer_and_rank[i] = t;
        // Only non-empty levels count; remember the size of the last one.
        if (sum_blocks > t) {
            ++m_max_level;
            last_block_size = sum_blocks - t;
        }
    }
    // No overflow bits are allocated for the blocks of the last non-empty level.
    m_overflow = bit_vector(sum_blocks - last_block_size, 0);
    m_data.resize(sum_blocks);

    assert(last_block_size > 0);

//  (3)    Enter block and overflow data
    // cnt is a working copy of the level offsets, used as per-level write cursors.
    int_vector<64> cnt = m_level_pointer_and_rank;
    // Mask selecting the lowest t_b bits of a value.
    const uint64_t mask = bits::lo_set[t_b];

    for (size_type i=0, j=0; i < n; ++i) {
        val=lcp_buf[i];
        j = cnt[0]++;
        m_data[ j ] =  val & mask;
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // Bits remain: mark the block just written as continued.
            m_overflow[j] = 1;
            // increase counter for current level by 1
            j = cnt[level_x_2]++;
            m_data[ j ] = val & mask;
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }

//  (4) Initialize rank data structure for m_overflow and precalc rank for
//      pointers
    util::init_support(m_overflow_rank, &m_overflow);
    // Stops at the first level whose start offset is not inside m_overflow.
    for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and
         m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) {
        m_level_pointer_and_rank[2*i+1] = m_overflow_rank(
                                              m_level_pointer_and_rank[2*i]);
    }
}
Пример #15
0
// Returns cache_dir(), Dir::DIR_SEP and cache_file_name() joined in that order.
std::string CDDB::cddb_cache_file() {
	std::string path(cache_dir());
	path += Dir::DIR_SEP;
	path += cache_file_name();
	return path;
}
Пример #16
0
void construct_lcp_PHI(cache_config& config)
{
    // Computes the LCP array from the cached suffix array and text and writes it
    // to the cache file for constants::KEY_LCP, then registers that file.
    // The suffix array is consumed block by block through sa_buf; the array
    // `plcp` first holds, per text position, the suffix preceding it in the
    // suffix array and is then overwritten with the LCP values in text order.
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

//	(1) Calculate PHI (stored in array plcp)
    int_vector<> plcp(n, 0, sa_buf.width);
    // i is the global suffix array index, r_sum the number of entries in the
    // blocks already processed, and r the value returned by load_next_block()
    // (used here as the number of entries in the current block).
    for (size_type i=0, r_sum=0, r=sa_buf.load_next_block(), sai_1 = 0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum]; // index relative to the current block
            plcp[ sai ] = sai_1;
            sai_1 = sai;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }

//  (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

//  (3) Calculate permuted LCP array (text order), called PLCP
    size_type max_l = 0;
    // l carries over between iterations (reduced by one) instead of restarting at 0.
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            --l;
        }
    }
    util::clear(text);
    // Number of bits needed for the largest LCP value.
    uint8_t lcp_width = bits::hi(max_l)+1;

//	(4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // NOTE(review): the file is opened with std::ios::app, so the header below
    // would be appended if a file of that name already exists - confirm whether
    // truncation is intended.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::app | std::ios::out);   // open buffer for lcp

    // Header: total size in bits, followed by the width of one entry.
    size_type bit_size = n*lcp_width;
    lcp_out_buf.write((char*) &(bit_size), sizeof(bit_size));	// write size of vector
    lcp_out_buf.write((char*) &(lcp_width),sizeof(lcp_width));  // write int_width of vector
    size_type wb = 0;  // bytes written into lcp int_vector

    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!

    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0; // the first LCP entry is 0; the loop below starts at i=1
    sa_buf.reset(buffer_size);
    // r starts at 0, so the first pass only loads the first block.
    size_type r = 0;// sa_buf.load_next_block();
    for (size_type i=1, r_sum=0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            lcp_buf[ i-r_sum ] = plcp[sai];
        }
        if (r > 0) {
            // Write the r entries of this block, rounded up to whole bytes.
            size_type cur_wb = (r*lcp_buf.width()+7)/8;
            lcp_out_buf.write((const char*)lcp_buf.data(), cur_wb);
            wb += cur_wb;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }
    // Pad the payload with zero bytes to a multiple of 8 bytes.
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}