void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, lcp_tag) { auto event = memory_monitor::event("construct compressed LCP"); const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT; typedef int_vector<t_width> text_type; { // (2) check, if the longest common prefix array is cached auto event = memory_monitor::event("LCP"); if (!cache_file_exists(conf::KEY_LCP, config)) { { auto event = memory_monitor::event("parse input text"); // (1) check, if the text is cached if (!cache_file_exists(KEY_TEXT, config)) { text_type text; load_vector_from_file(text, file, num_bytes); if (contains_no_zero_symbol(text, file)) { append_zero_symbol(text); store_to_cache(text,KEY_TEXT, config); } } register_cache_file(KEY_TEXT, config); } { // (2) check, if the suffix array is cached auto event = memory_monitor::event("SA"); if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_width>(config); } register_cache_file(conf::KEY_SA, config); } if (t_width==8) { construct_lcp_semi_extern_PHI(config); } else { construct_lcp_PHI<t_width>(config); } } register_cache_file(conf::KEY_LCP, config); } { auto event = memory_monitor::event("compressed LCP"); t_index tmp(config); tmp.swap(idx); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, csa_tag) { auto event = memory_monitor::event("construct CSA"); const char* KEY_TEXT = key_text_trait<t_index::alphabet_category::WIDTH>::KEY_TEXT; const char* KEY_BWT = key_bwt_trait<t_index::alphabet_category::WIDTH>::KEY_BWT; typedef int_vector<t_index::alphabet_category::WIDTH> text_type; { auto event = memory_monitor::event("parse input text"); // (1) check, if the text is cached if (!cache_file_exists(KEY_TEXT, config)) { text_type text; load_vector_from_file(text, file, num_bytes); if (contains_no_zero_symbol(text, file)) { append_zero_symbol(text); store_to_cache(text,KEY_TEXT, config); } } register_cache_file(KEY_TEXT, config); } { // (2) check, if the suffix array is cached auto event = memory_monitor::event("SA"); if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_index::alphabet_category::WIDTH>(config); } register_cache_file(conf::KEY_SA, config); } { // (3) construct BWT auto event = memory_monitor::event("BWT"); if (!cache_file_exists(KEY_BWT, config)) { construct_bwt<t_index::alphabet_category::WIDTH>(config); } register_cache_file(KEY_BWT, config); } { // (4) use BWT to construct the CSA auto event = memory_monitor::event("construct CSA"); t_index tmp(config); idx.swap(tmp); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
//! Builds the LCP array from the cached suffix array and text via the PHI array.
/*!
 * \param config Cache configuration. The suffix array is read from
 *               `config.file_map[conf::KEY_SA]`, the text is loaded from the
 *               cache, and the resulting LCP file is registered in `config`.
 *
 * The result is written to the cache file of conf::KEY_LCP. For an input of
 * length 1 a single-entry LCP array containing 0 is stored instead.
 */
void construct_lcp_PHI(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    typedef int_vector<>::size_type size_type;
    const char* text_key = key_text_trait<t_width>::KEY_TEXT;

    int_vector_buffer<> sa_buf(config.file_map[conf::KEY_SA]);
    size_type n = sa_buf.size();

    assert(n > 0);
    if (1 == n) {
        // Special case: the input consists of a single symbol only.
        int_vector<> trivial_lcp(1, 0);
        store_to_cache(trivial_lcp, conf::KEY_LCP, config);
        return;
    }

    // Phase 1: PHI array. For every text position store the text position
    // that precedes it in suffix array order (0 for the very first entry).
    int_vector<> plcp(n, 0, sa_buf.width());
    {
        size_type prev_suffix = 0;
        for (size_type rank = 0; rank < n; ++rank) {
            size_type cur_suffix = sa_buf[rank];
            plcp[cur_suffix] = prev_suffix;
            prev_suffix = cur_suffix;
        }
    }

    // Phase 2: bring the text into memory.
    int_vector<t_width> text;
    load_from_cache(text, text_key, config);

    // Phase 3: overwrite PHI in place with the LCP values in text order (PLCP).
    // The match length carried over to the next position is the current one
    // minus one (or zero).
    size_type longest = 0;
    {
        size_type match = 0;
        for (size_type pos = 0; pos < n-1; ++pos) {
            size_type other = plcp[pos];
            // NOTE(review): termination relies on the two suffixes differing
            // before running past the text - presumably guaranteed by the
            // terminating symbol; confirm.
            while (text[pos+match] == text[other+match]) {
                ++match;
            }
            plcp[pos] = match;
            if (match) {
                if (longest < match) {
                    longest = match;
                }
                --match;
            }
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(longest)+1;

    // Phase 4: permute PLCP into suffix array order and stream it to disk.
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width);
    lcp_buf[0] = 0;
    sa_buf.buffersize(buffer_size);
    for (size_type rank = 1; rank < n; ++rank) {
        size_type suffix = sa_buf[rank];
        lcp_buf[rank] = plcp[suffix];
    }
    lcp_buf.close();
    register_cache_file(conf::KEY_LCP, config);
}
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, cst_tag) { auto event = memory_monitor::event("construct CST"); const char* KEY_TEXT = key_text_trait<t_index::alphabet_category::WIDTH>::KEY_TEXT; const char* KEY_BWT = key_bwt_trait<t_index::alphabet_category::WIDTH>::KEY_BWT; csa_tag csa_t; { // (1) check, if the compressed suffix array is cached typename t_index::csa_type csa; if (!cache_file_exists(std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config)) { cache_config csa_config(false, config.dir, config.id, config.file_map); construct(csa, file, csa_config, num_bytes, csa_t); auto event = memory_monitor::event("store CSA"); config.file_map = csa_config.file_map; store_to_cache(csa,std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config); } register_cache_file(std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config); } { // (2) check, if the longest common prefix array is cached auto event = memory_monitor::event("LCP"); register_cache_file(KEY_TEXT, config); register_cache_file(KEY_BWT, config); register_cache_file(conf::KEY_SA, config); if (!cache_file_exists(conf::KEY_LCP, config)) { if (t_index::alphabet_category::WIDTH==8) { construct_lcp_semi_extern_PHI(config); } else { construct_lcp_PHI<t_index::alphabet_category::WIDTH>(config); } } register_cache_file(conf::KEY_LCP, config); } { auto event = memory_monitor::event("CST"); t_index tmp(config); tmp.swap(idx); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
//! Constructor csa_bitcompressed(cache_config& config) { std::string text_file = cache_file_name(key_trait<alphabet_type::int_width>::KEY_TEXT,config); int_vector_buffer<alphabet_type::int_width> text_buf(text_file); int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA,config)); size_type n = text_buf.size(); { alphabet_type tmp_alphabet(text_buf, n); m_alphabet.swap(tmp_alphabet); } { sa_sample_type tmp_sample(config); m_sa.swap(tmp_sample); } set_isa_samples<csa_bitcompressed>(sa_buf, m_isa); if (!store_to_file(m_isa, cache_file_name(conf::KEY_ISA,config), true)) { throw std::ios_base::failure("#csa_bitcompressed: Cannot store ISA to file system!"); } else { register_cache_file(conf::KEY_ISA, config); } }
//! Builds the LCP array from the cached suffix array and text via the PHI array.
/*!
 * \param config Cache configuration. The suffix array is read from
 *               `config.file_map[constants::KEY_SA]`, the text is loaded from
 *               the cache, and the resulting LCP file is registered in `config`.
 *
 * The result is written to the cache file of constants::KEY_LCP as a header
 * (size in bits, followed by the integer width) and the packed LCP values,
 * padded with zero bytes to a multiple of 8 bytes. For an input of length 1
 * a single-entry LCP array containing 0 is stored instead.
 *
 * The output file is opened in truncating mode so that an already existing
 * file at the same path is replaced instead of being appended to.
 */
void construct_lcp_PHI(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

    // (1) Calculate PHI (stored in array plcp).
    //     The suffix array is consumed block by block: `r` is the size of the
    //     block currently loaded and `r_sum` the number of entries before it.
    int_vector<> plcp(n, 0, sa_buf.width);
    for (size_type i=0, r_sum=0, r=sa_buf.load_next_block(), sai_1 = 0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            plcp[ sai ] = sai_1; // predecessor in suffix array order
            sai_1 = sai;
        }
        r_sum += r;
        r = sa_buf.load_next_block();
    }

    // (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // (3) Calculate permuted LCP array (text order), called PLCP.
    //     The match length carried over to the next position is the current
    //     one minus one (or zero).
    size_type max_l = 0;
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        // NOTE(review): termination relies on the two suffixes differing
        // before running past the text - presumably guaranteed by the
        // terminating symbol; confirm.
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            --l;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(max_l)+1;

    // (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // Truncate instead of append: appending to a pre-existing file would put
    // the new header and payload behind the old content.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::trunc | std::ios::out); // open buffer for lcp
    size_type bit_size = n*lcp_width;
    lcp_out_buf.write(reinterpret_cast<const char*>(&bit_size), sizeof(bit_size));   // write size of vector
    lcp_out_buf.write(reinterpret_cast<const char*>(&lcp_width), sizeof(lcp_width)); // write int_width of vector
    size_type wb = 0;                // bytes written into lcp int_vector
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0; // first LCP entry; the loop below starts at i = 1
    sa_buf.reset(buffer_size);
    // `r` starts at 0, so the first pass of the outer loop only loads the
    // first suffix array block.
    size_type r = 0;
    for (size_type i=1, r_sum=0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            lcp_buf[ i-r_sum ] = plcp[sai];
        }
        if (r > 0) {
            // Write the bytes covered by the `r` entries of this block.
            size_type cur_wb = (r*lcp_buf.width()+7)/8;
            lcp_out_buf.write(reinterpret_cast<const char*>(lcp_buf.data()), cur_wb);
            wb += cur_wb;
        }
        r_sum += r;
        r = sa_buf.load_next_block();
    }
    // Pad the payload to a multiple of 8 bytes.
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}