//! Build the CSA from the files registered in `config`.
/*!
 *  \param config Cache configuration used to locate the BWT file and passed
 *                on to the SA/ISA sampling constructors.
 *
 *  If no BWT file is registered for the alphabet width, the constructor
 *  returns immediately and leaves all members in their default state.
 */
csa_wt<t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_wt(cache_config& config)
{
    // Nothing to build from when the BWT has not been cached.
    if (!cache_file_exists(key_trait<alphabet_type::int_width>::KEY_BWT, config)) {
        return;
    }

    // Step 1: alphabet, built from a fresh buffer over the BWT file.
    {
        auto event = memory_monitor::event("construct csa-alpbabet");
        int_vector_buffer<alphabet_type::int_width> bwt_in(cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT,config));
        size_type bwt_len = bwt_in.size();
        alphabet_type built_alphabet(bwt_in, bwt_len);
        m_alphabet.swap(built_alphabet);
    }

    // Step 2: wavelet tree, built from a second, independent buffer over the BWT file.
    {
        auto event = memory_monitor::event("construct wavelet tree");
        int_vector_buffer<alphabet_type::int_width> bwt_in(cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT,config));
        size_type bwt_len = bwt_in.size();
        wavelet_tree_type built_wt(bwt_in, bwt_len);
        m_wavelet_tree.swap(built_wt);
    }

    // Step 3: suffix array samples.
    {
        auto event = memory_monitor::event("sample SA");
        sa_sample_type built_sa_sample(config);
        m_sa_sample.swap(built_sa_sample);
    }

    // Step 4: inverse suffix array samples; they are given a pointer to the SA samples.
    {
        auto event = memory_monitor::event("sample ISA");
        isa_sample_type built_isa_sample(config, &m_sa_sample);
        util::swap_support(m_isa_sample, built_isa_sample, &m_sa_sample, &m_sa_sample);
    }
}
//! Build the CSA (alphabet, encoded PSI, SA and ISA samples) from the cached BWT.
/*!
 *  \param config Cache configuration used to locate the BWT file, to store
 *                and re-open the PSI file, and passed on to the SA/ISA
 *                sampling constructors.
 *
 *  The constructor returns early, leaving the remaining members untouched, if
 *  - no BWT file is registered in `config`, or
 *  - the uncompressed PSI array cannot be stored to the cache.
 */
csa_sada<t_enc_vec, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_sada(cache_config& config)
{
    create_buffer();
    if (!cache_file_exists(key_trait<alphabet_type::int_width>::KEY_BWT, config)) {
        return;
    }
    int_vector_buffer<alphabet_type::int_width> bwt_buf(cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT,config));
    size_type n = bwt_buf.size();
    {
        auto event = memory_monitor::event("construct csa-alpbabet");
        alphabet_type tmp_alphabet(bwt_buf, n);
        m_alphabet.swap(tmp_alphabet);
    }

    // Running write position per character, initialised from the C array.
    int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1);
    for (typename alphabet_type::sigma_type i=0; i < sigma; ++i) {
        cnt_chr[i] = C[i];
    }

    // calculate psi
    {
        auto event = memory_monitor::event("construct PSI");
        // TODO: move PSI construct into construct_PSI.hpp
        int_vector<> psi(n, 0, bits::hi(n)+1);
        for (size_type i=0; i < n; ++i) {
            // BWT position i is written to the next free slot of its character.
            psi[ cnt_chr[ char2comp[bwt_buf[i]] ]++ ] = i;
        }
        if (!store_to_cache(psi, conf::KEY_PSI, config)) {
            return;
        }
    }
    {
        auto event = memory_monitor::event("encode PSI");
        // Re-open the PSI file as a buffer and build the encoded vector from it.
        int_vector_buffer<> psi_buf(cache_file_name(conf::KEY_PSI, config));
        t_enc_vec tmp_psi(psi_buf);
        m_psi.swap(tmp_psi);
    }
    {
        auto event = memory_monitor::event("sample SA");
        sa_sample_type tmp_sa_sample(config);
        m_sa_sample.swap(tmp_sa_sample);
    }
    {
        auto event = memory_monitor::event("sample ISA");
        isa_sample_type isa_s(config, &m_sa_sample);
        // The typed null pointer is the fourth argument, next to the temporary `isa_s`.
        util::swap_support(m_isa_sample, isa_s, &m_sa_sample, static_cast<const sa_sample_type*>(nullptr));
    }
}
//! Constructor lcp_byte(cache_config& config) { std::string lcp_file = cache_file_name(conf::KEY_LCP, config); int_vector_buffer<> lcp_buf(lcp_file); m_small_lcp = int_vector<8>(lcp_buf.size()); size_type l=0, max_l=0, max_big_idx=0, big_sum=0; for (size_type i=0; i < m_small_lcp.size(); ++i) { if ((l=lcp_buf[i]) < 255) { m_small_lcp[i] = l; } else { m_small_lcp[i] = 255; if (l > max_l) max_l = l; max_big_idx = i; ++big_sum; } } m_big_lcp = int_vector<>(big_sum, 0, bits::hi(max_l)+1); m_big_lcp_idx = int_vector<>(big_sum, 0, bits::hi(max_big_idx)+1); for (size_type i=0,ii=0; i<m_small_lcp.size(); ++i) { if ((l=lcp_buf[i]) >= 255) { m_big_lcp[ii] = l; m_big_lcp_idx[ii] = i; ++ii; } } }
void register_cache_file(const char* key, cache_config &config){ std::string file_name = cache_file_name(key, config); std::ifstream in(file_name.c_str()); if ( in ){ // if file exists, register it. config.file_map[std::string(key)] = file_name; } }
//! Build a matching_index from the sequence stored in `file`.
/*!
 *  \param idx       Index that receives the result.
 *  \param file      Input file; loaded as an integer vector and also handed to
 *                   the CSA construction.
 *  \param config    Cache configuration; all files in `config.file_map` are
 *                   deleted before the index is composed.
 *  \param num_bytes Forwarded to `load_vector_from_file` and to the CSA
 *                   construction.
 */
void construct(matching_index<t_wt, t_bv>& idx, const std::string& file, sdsl::cache_config& config, uint8_t num_bytes)
{
    // Step "text": load the input sequence.
    sdsl::int_vector<0> text;
    load_vector_from_file(text, file, num_bytes);

    // Step "csa": the CSA object itself is not used afterwards in this function.
    sdsl::csa_wt<sdsl::wt_int<>> csa;
    construct(csa, file, config, num_bytes);

    // Step "wt": build the wavelet tree over the cached suffix array file.
    t_wt wts;
    construct(wts, cache_file_name(sdsl::conf::KEY_SA, config));

    sdsl::util::delete_all_files(config.file_map);

    // Step "compose": contains rank support initialization.
    idx = std::move(matching_index<t_wt, t_bv>(text, wts));
}
//! Constructor taking a cache_config lcp_bitcompressed(cache_config& config) { std::string lcp_file = cache_file_name(conf::KEY_LCP, config); int_vector_buffer<> lcp_buf(lcp_file); m_lcp = int_vector<t_width>(lcp_buf.size(), 0, lcp_buf.width()); for (size_type i=0; i < m_lcp.size(); ++i) { m_lcp[i] = lcp_buf[i]; } }
//! Construct the LCP array from the cached suffix array and text via a PHI array.
/*!
 *  \param config Cache configuration. The SA file is read from
 *                `config.file_map[conf::KEY_SA]`, the text is loaded from the
 *                cache, and the resulting LCP file is written and registered
 *                under `conf::KEY_LCP`.
 *
 *  The suffix array must be non-empty (checked with `assert`).
 */
void construct_lcp_PHI(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    using size_type = int_vector<>::size_type;
    using text_type = int_vector<t_width>;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;

    int_vector_buffer<> sa_buf(config.file_map[conf::KEY_SA]);
    const size_type n = sa_buf.size();
    assert(n > 0);

    if (n == 1) {
        // Handle special case: Input only the sentinel character.
        int_vector<> trivial_lcp(1, 0);
        store_to_cache(trivial_lcp, conf::KEY_LCP, config);
        return;
    }

    // (1) Calculate PHI (stored in array plcp): for each text position, the
    //     text position of the suffix preceding it in the suffix array.
    int_vector<> plcp(n, 0, sa_buf.width());
    {
        size_type prev_suffix = 0;
        for (size_type rank = 0; rank < n; ++rank) {
            const size_type cur_suffix = sa_buf[rank];
            plcp[cur_suffix] = prev_suffix;
            prev_suffix = cur_suffix;
        }
    }

    // (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // (3) Calculate permuted LCP array (text order), called PLCP.
    //     The match length carried over to the next position is reduced by one.
    size_type longest = 0;
    {
        size_type match = 0;
        for (size_type pos = 0; pos < n-1; ++pos) {
            const size_type other = plcp[pos];
            while (text[pos+match] == text[other+match]) {
                ++match;
            }
            plcp[pos] = match;
            if (match) {
                longest = std::max(longest, match);
                --match;
            }
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(longest)+1;

    // (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
    lcp_buf[0] = 0;
    sa_buf.buffersize(buffer_size);
    for (size_type rank = 1; rank < n; ++rank) {
        const size_type suffix = sa_buf[rank];
        lcp_buf[rank] = plcp[suffix];
    }
    lcp_buf.close();
    register_cache_file(conf::KEY_LCP, config);
}
//! Construct lcp_vlc(cache_config& config, std::string other_key="") { std::string lcp_key = conf::KEY_LCP; if ("" != other_key) { lcp_key = other_key; } int_vector_buffer<> lcp_buf(cache_file_name(lcp_key, config)); vlc_vec_type tmp_vec(lcp_buf); m_vec.swap(tmp_vec); }
//! Constructor csa_bitcompressed(cache_config& config) { std::string text_file = cache_file_name(key_trait<alphabet_type::int_width>::KEY_TEXT,config); int_vector_buffer<alphabet_type::int_width> text_buf(text_file); int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA,config)); size_type n = text_buf.size(); { alphabet_type tmp_alphabet(text_buf, n); m_alphabet.swap(tmp_alphabet); } { sa_sample_type tmp_sample(config); m_sa.swap(tmp_sample); } set_isa_samples<csa_bitcompressed>(sa_buf, m_isa); if (!store_to_file(m_isa, cache_file_name(conf::KEY_ISA,config), true)) { throw std::ios_base::failure("#csa_bitcompressed: Cannot store ISA to file system!"); } else { register_cache_file(conf::KEY_ISA, config); } }
doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) { construct(m_csa_full, file_name, cconfig, num_bytes); const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT; std::string text_file = cache_file_name(KEY_TEXT, cconfig); bit_vector doc_border; construct_doc_border(text_file,doc_border); bit_vector::rank_1_type doc_border_rank(&doc_border); m_doc_cnt = doc_border_rank(doc_border.size()); int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig)); { int_vector<> D; construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D); std::string d_file = cache_file_name("DARRAY", cconfig); store_to_file(D, d_file); util::clear(D); construct(m_wtd, d_file); sdsl::remove(d_file); } }
//! Construct a wavelet-tree index from the sequence stored in `file`.
/*!
 *  \param idx       Index that receives the result (via swap).
 *  \param file      Input file, loaded with `load_vector_from_file`.
 *  \param config    Cache configuration used to name the temporary file.
 *  \param num_bytes Forwarded to `load_vector_from_file`.
 *
 *  The sequence is written to a temporary file whose key is built from the
 *  process id and a further id, and the index is built from a buffer over
 *  that file. The temporary file is removed before the function returns.
 */
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, wt_tag)
{
    auto event = memory_monitor::event("construct wavelet tree");

    int_vector<t_index::alphabet_category::WIDTH> sequence;
    load_vector_from_file(sequence, file, num_bytes);

    // Temporary file key: "<pid>_<id>".
    std::string scratch_key = util::to_string(util::pid())+"_"+util::to_string(util::id());
    std::string scratch_path = cache_file_name(scratch_key, config);
    store_to_file(sequence, scratch_path);
    util::clear(sequence);   // in-memory copy no longer needed

    {
        int_vector_buffer<t_index::alphabet_category::WIDTH> sequence_buf(scratch_path);
        t_index built(sequence_buf, sequence_buf.size());
        idx.swap(built);
    }
    sdsl::remove(scratch_path);
}
//! Print the maximum, sum and average of the cached LCP array to std::cout.
/*!
 *  \param config Cache configuration used to locate the LCP file.
 *
 *  Each LCP entry is read from the file buffer exactly once. For an empty
 *  LCP array the average is reported as 0 instead of dividing by zero.
 */
void lcp_info(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    int_vector_buffer<> lcp_buf(cache_file_name(conf::KEY_LCP, config));
    const size_type n = lcp_buf.size();
    size_type max_lcp = 0;
    size_type sum_lcp = 0;
    for (size_type i=0; i < n; ++i) {
        const size_type cur = lcp_buf[i];   // single buffer access per entry
        if (cur > max_lcp) {
            max_lcp = cur;
        }
        sum_lcp += cur;
    }
    // Guard against 0/0 for an empty array.
    const double avg_lcp = (n > 0) ? static_cast<double>(sum_lcp) / static_cast<double>(n) : 0.0;
    std::cout<<"# max lcp = " << max_lcp << std::endl;
    std::cout<<"# sum lcp = " << sum_lcp << std::endl;
    std::cout<<"# avg lcp = " << avg_lcp << std::endl;
}
//! Constructor lcp_wt(cache_config& config, std::string other_key="") { std::string temp_file = tmp_file(config, "_lcp_sml"); std::string lcp_key = conf::KEY_LCP; if ("" != other_key) { lcp_key = other_key; } int_vector_buffer<> lcp_buf(cache_file_name(lcp_key, config)); size_type l=0, max_l=0, big_sum=0, n = lcp_buf.size(); { int_vector<8> small_lcp = int_vector<8>(n); for (size_type i=0; i < n; ++i) { if ((l=lcp_buf[i]) < 255) { small_lcp[i] = l; } else { small_lcp[i] = 255; if (l > max_l) max_l = l; ++big_sum; } } store_to_file(small_lcp, temp_file); } { int_vector_buffer<8> lcp_sml_buf(temp_file); small_lcp_type tmp(lcp_sml_buf, lcp_sml_buf.size()); m_small_lcp.swap(tmp); } sdsl::remove(temp_file); m_big_lcp = int_vector<>(big_sum, 0, bits::hi(max_l)+1); { for (size_type i=0, ii=0; i < n; ++i) { if (lcp_buf[i] >= 255) { m_big_lcp[ ii++ ] = lcp_buf[i]; } } } }
//! Build the block-wise (t_b bits per block) multi-level representation of the cached LCP array.
/*!
 *  \param config Cache configuration used to locate the LCP file.
 *
 *  Each value is split into t_b-bit blocks. The lowest block goes to level 0,
 *  each further non-zero remainder to the next level. `m_data` holds all
 *  blocks level by level, and `m_overflow` marks the blocks that have a
 *  continuation on the next level. In `m_level_pointer_and_rank`, even
 *  entries hold the start offset of a level in `m_data` and odd entries hold
 *  the rank of `m_overflow` at that offset.
 *
 *  Returns immediately if the LCP array is empty.
 */
lcp_dac<t_b, t_rank>::lcp_dac(cache_config& config)
{
    // (1) Count for each level, how many blocks are needed for the representation
    //     Running time: \f$ O(n \times \frac{\log n}{b}) \f$
    //     Result is stored in m_level_pointer_and_rank
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    int_vector_buffer<> lcp_buf(lcp_file);
    size_type n = lcp_buf.size(), val=0;
    if (n == 0)
        return;
    // initialize counter: two entries per level, at least four entries
    auto _size = std::max(4*bits::hi(2), 2*(((bits::hi(n)+1)+t_b-1) / t_b));
    m_level_pointer_and_rank.resize(_size);
    for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i)
        m_level_pointer_and_rank[i] = 0;
    m_level_pointer_and_rank[0] = n; // level 0 has n entries
    uint8_t level_x_2 = 0; // twice the current level, i.e. the index of its counter
    for (size_type i=0; i < n; ++i) {
        val=lcp_buf[i];
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // increase counter for current level by 1
            ++m_level_pointer_and_rank[level_x_2];
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }
    // (2) Determine maximum level and prefix sums of level counters
    //     (each even entry turns from a block count into a start offset)
    m_max_level = 0;
    size_type sum_blocks = 0, last_block_size=0;
    for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) {
        t = sum_blocks;
        sum_blocks += m_level_pointer_and_rank[i];
        m_level_pointer_and_rank[i] = t;
        if (sum_blocks > t) { // level is non-empty
            ++m_max_level;
            last_block_size = sum_blocks - t;
        }
    }
    // Blocks of the last non-empty level get no overflow bit.
    m_overflow = bit_vector(sum_blocks - last_block_size, 0);
    m_data.resize(sum_blocks);
    assert(last_block_size > 0);
    // (3) Enter block and overflow data
    int_vector<64> cnt = m_level_pointer_and_rank; // running write position per level
    const uint64_t mask = bits::lo_set[t_b];
    for (size_type i=0, j=0; i < n; ++i) {
        val=lcp_buf[i];
        j = cnt[0]++;
        m_data[ j ] = val & mask;
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            m_overflow[j] = 1; // the block at j continues on the next level
            // increase counter for current level by 1
            j = cnt[level_x_2]++;
            m_data[ j ] = val & mask;
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }
    // (4) Initialize rank data structure for m_overflow and precalc rank for
    //     pointers
    util::init_support(m_overflow_rank, &m_overflow);
    for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and
         m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) {
        m_level_pointer_and_rank[2*i+1] = m_overflow_rank(
                                              m_level_pointer_and_rank[2*i]);
    }
}
// Returns the path of the CDDB cache file: cache directory, directory
// separator and cache file name, concatenated in that order.
std::string CDDB::cddb_cache_file()
{
    const std::string full_path = cache_dir() + Dir::DIR_SEP + cache_file_name();
    return full_path;
}
//! Construct the LCP array from the cached suffix array and text via a PHI array.
/*!
 *  \param config Cache configuration. The SA file is read from
 *                `config.file_map[constants::KEY_SA]`, the text is loaded
 *                from the cache, and the resulting LCP file is written and
 *                registered under `constants::KEY_LCP`.
 *
 *  The suffix array is streamed block-wise, and the LCP file is serialised
 *  by hand: bit size, then width, then the packed values padded to a
 *  multiple of 8 bytes.
 *  The suffix array must be non-empty (checked with `assert`).
 */
void construct_lcp_PHI(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

    // (1) Calculate PHI (stored in array plcp): for each text position, the
    //     text position of the suffix preceding it in the suffix array.
    //     r is the size of the current SA block, r_sum the number of entries
    //     in all previous blocks.
    int_vector<> plcp(n, 0, sa_buf.width);
    for (size_type i=0, r_sum=0, r=sa_buf.load_next_block(), sai_1 = 0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            plcp[ sai ] = sai_1;
            sai_1 = sai;
        }
        r_sum += r;
        r = sa_buf.load_next_block();
    }

    // (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // (3) Calculate permuted LCP array (text order), called PLCP
    //     The match length carried over to the next position is reduced by one.
    size_type max_l = 0;
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            --l;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(max_l)+1;

    // (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // NOTE(review): the stream is opened with std::ios::app; if a file with this
    // name already exists, header and data would presumably be appended to the
    // old content - confirm whether truncation is intended here.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::app | std::ios::out);   // open buffer for lcp
    size_type bit_size = n*lcp_width;
    lcp_out_buf.write((char*) &(bit_size), sizeof(bit_size));		// write size of vector
    lcp_out_buf.write((char*) &(lcp_width),sizeof(lcp_width));       // write int_width of vector
    size_type wb = 0;  // bytes written into lcp int_vector

    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0;
    sa_buf.reset(buffer_size);
    // r starts at 0, so the first pass of the outer loop only loads the first
    // SA block. i starts at 1, which leaves lcp_buf[0] == 0 for the first entry.
    size_type r = 0;// sa_buf.load_next_block();
    for (size_type i=1, r_sum=0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            lcp_buf[ i-r_sum ] = plcp[sai];
        }
        if (r > 0) {
            // number of bytes needed for the r packed entries of this block
            size_type cur_wb = (r*lcp_buf.width()+7)/8;
            lcp_out_buf.write((const char*)lcp_buf.data(), cur_wb);
            wb += cur_wb;
        }
        r_sum += r;
        r = sa_buf.load_next_block();
    }
    // pad the data section with zero bytes up to a multiple of 8 bytes
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}