/*!
 * \brief Builds the alphabet from the first \p len bytes of \p text_buf.
 *
 * Counts how often every byte value occurs, marks the occurring byte values
 * in m_char, stores the cumulative counts in m_C (sigma+1 entries, the last
 * one being the total) and initializes the rank/select supports on m_char.
 * If \p len or the buffer size is zero only m_sigma is set (to 0).
 *
 * \param text_buf Byte stream.
 * \param len      Length of the byte stream.
 */
succinct_byte_alphabet(int_vector_buffer<8>& text_buf, int_vector_size_type len):
    char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
{
    m_sigma = 0;
    if (0 == len || 0 == text_buf.size()) {
        return;
    }
    assert(len <= text_buf.size());

    // Histogram over all byte values (one spare slot, as in the 257-entry layout).
    int_vector<64> D(257, 0);
    bit_vector tmp_char(256, 0);
    for (size_type pos = 0; pos < len; ++pos) {
        ++D[text_buf[pos]];
    }
    assert(1 == D[0]); // null-byte should occur exactly once

    // Mark the occurring symbols and move their counts to the front of D.
    // m_sigma never exceeds sym, so no count is overwritten before it is read.
    for (int sym = 0; sym < 256; ++sym) {
        if (0 == D[sym]) {
            continue;
        }
        tmp_char[sym] = 1;
        D[m_sigma] = D[sym];
        ++m_sigma;
    }

    // sigma+1 entries, since CSAs also need the sum of all elements.
    m_C = C_type(m_sigma+1, 0, bits::hi(len)+1);
    m_C[0] = 0;
    size_type total = 0;
    for (int rank = 1; rank <= static_cast<int>(m_sigma); ++rank) {
        total += D[rank-1];
        m_C[rank] = total;
    }
    assert(m_C[sigma]==len);

    m_char = tmp_char;
    util::init_support(m_char_rank, &m_char);
    util::init_support(m_char_select, &m_char);
}
/*!
 * \brief Fills \p D with one rank value per entry of \p sa_buf.
 *
 * \p D is replaced by a vector of sa_buf.size() entries, each wide enough to
 * hold doc_cnt+1, and entry i is set to doc_border_rank(sa_buf[i]+1).
 *
 * \param sa_buf          Buffer whose entries are fed (plus one) into the rank query.
 * \param doc_border_rank Rank support queried for every entry of \p sa_buf.
 * \param doc_cnt         Value that determines the bit width of the entries of \p D.
 * \param D               Output vector; its previous content is discarded.
 */
void construct_D_array(int_vector_buffer<0>& sa_buf, bit_vector::rank_1_type& doc_border_rank, const size_type doc_cnt, int_vector<>& D)
{
    const size_type entries = sa_buf.size();
    D = int_vector<>(entries, 0, bits::hi(doc_cnt+1)+1);
    for (size_type pos = 0; pos < entries; ++pos) {
        const uint64_t rank_value = doc_border_rank(sa_buf[pos]+1);
        D[pos] = rank_value;
    }
}
/*!
 * \brief Writes a balanced-parentheses sequence and a first-child bit vector for \p lcp_buf.
 *
 * \p bp is resized to 2n bits and \p bp_fc to n bits (n = lcp_buf.size()); both are
 * zeroed. Every value of \p lcp_buf is pushed on a stack and contributes a 1-bit
 * (opening parenthesis) to \p bp; every pop contributes a 0-bit (closing parenthesis)
 * to \p bp and one bit to \p bp_fc, which is set to 1 whenever the stack's pop()
 * returns true.
 *
 * \param lcp_buf Input values.
 * \param bp      Output: balanced parentheses sequence.
 * \param bp_fc   Output: one bit per closing parenthesis.
 * \param minimum If true, stack entries larger than the current value are popped
 *                before it is pushed; otherwise entries smaller than it are popped.
 * \return Number of bits set in \p bp_fc.
 */
bit_vector::size_type construct_supercartesian_tree_bp_succinct_and_first_child(
    int_vector_buffer<t_width>& lcp_buf, bit_vector& bp, bit_vector& bp_fc, const bool minimum = true)
{
    typedef bit_vector::size_type size_type;

    const size_type n = lcp_buf.size();
    bp.resize(2 * n); // 2 n bits for the balanced parentheses
    bp_fc.resize(n);
    if (0 == n) {
        return 0;
    }

    util::set_to_value(bp, 0);
    util::set_to_value(bp_fc, 0);
    sorted_multi_stack_support vec_stack(n);

    size_type fc_cnt = 0;  // number of 1-bits written to bp_fc
    size_type bp_pos = 0;  // next write position in bp
    size_type fc_pos = 0;  // next write position in bp_fc

    // Pops the top element: records the result of pop() in bp_fc and skips one
    // position in bp (closing parenthesis; bp is already zero-initialized).
    auto close_top = [&]() {
        if (vec_stack.pop()) {
            bp_fc[fc_pos] = 1;
            ++fc_cnt;
        }
        ++bp_pos;
        ++fc_pos;
    };

    // Neither variant uses the "lazy stack" trick.
    if (minimum) {
        for (size_type i = 0; i < n; ++i) {
            const size_type value = lcp_buf[i];
            while (!vec_stack.empty() && value < vec_stack.top()) {
                close_top();
            }
            vec_stack.push(value);
            bp[bp_pos++] = 1; // opening parenthesis
        }
    } else {
        for (size_type i = 0; i < n; ++i) {
            const size_type value = lcp_buf[i];
            while (!vec_stack.empty() && value > vec_stack.top()) {
                close_top();
            }
            vec_stack.push(value);
            bp[bp_pos++] = 1; // opening parenthesis
        }
    }

    // Close everything that is still open.
    while (!vec_stack.empty()) {
        close_top();
    }
    return fc_cnt;
}
/*!
 * \brief Returns a balanced-parentheses sequence of 2*lcp_buf.size() bits for \p lcp_buf.
 *
 * Every value contributes a 1-bit (opening parenthesis); closing parentheses are
 * the 0-bits the vector is initialized with and are produced by advancing the write
 * position. Closing parentheses for elements still open after the last value are
 * the remaining zero bits at the end.
 *
 * \param lcp_buf Input values.
 * \param minimum If true, larger pending values are closed before a value is opened
 *                (with the "lazy stack" variant); otherwise smaller pending values
 *                are closed (plain stack variant).
 * \return The balanced parentheses sequence.
 */
bit_vector construct_supercartesian_tree_bp_succinct(int_vector_buffer<t_width>& lcp_buf, const bool minimum = true)
{
    typedef bit_vector::size_type size_type;

    const size_type n = lcp_buf.size();
    bit_vector bp(2 * n, 0); // result, all closing parentheses initially
    if (0 == n) {
        return bp;
    }

    sorted_multi_stack_support vec_stack(n);
    size_type out = 0; // next write position in bp

    if (minimum) {
        // "lazy stack" trick (speed-up about 25 %): the most recent value is kept
        // in `pending` and only pushed once a value that is not smaller follows.
        bp[out++] = 1;
        size_type pending = lcp_buf[0];
        for (size_type i = 1; i < n; ++i) {
            const size_type value = lcp_buf[i];
            if (value < pending) {
                ++out; // closing parenthesis for pending
                while (!vec_stack.empty() && value < vec_stack.top()) {
                    vec_stack.pop();
                    ++out; // closing parenthesis, bp is already zero there
                }
            } else {
                vec_stack.push(pending);
            }
            bp[out++] = 1; // opening parenthesis
            pending = value;
        }
    } else {
        // no "lazy stack" trick here
        for (size_type i = 0; i < n; ++i) {
            const size_type value = lcp_buf[i];
            while (!vec_stack.empty() && value > vec_stack.top()) {
                vec_stack.pop();
                ++out; // closing parenthesis, bp is already zero there
            }
            vec_stack.push(value);
            bp[out++] = 1; // opening parenthesis
        }
    }
    return bp;
}
/*!
 * \brief Builds the alphabet from the first \p len symbols of \p text_buf.
 *
 * Counts the occurrences of every symbol, sets m_sigma to the number of distinct
 * symbols and stores the cumulative counts in m_C (sigma+1 entries, the last one
 * being the total). Unless is_continuous_alphabet() holds for the counts, the
 * occurring symbols are additionally marked in m_char and the rank/select
 * supports on it are initialized.
 * If \p len or the buffer size is zero only m_sigma is set (to 0).
 *
 * \param text_buf Byte stream.
 * \param len      Length of the byte stream.
 */
int_alphabet(int_vector_buffer<0>& text_buf, int_vector_size_type len):
    char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
{
    m_sigma = 0;
    if (0 == len || 0 == text_buf.size()) {
        return;
    }
    assert(len <= text_buf.size());

    // symbol -> number of occurrences, ordered by symbol
    std::map<size_type, size_type> D;
    for (size_type pos = 0; pos < len; ++pos) {
        ++D[text_buf[pos]];
    }
    m_sigma = D.size();

    // For a continuous alphabet symbols can be mapped directly, so m_char,
    // m_char_rank and m_char_select stay uninitialized in that case.
    if (!is_continuous_alphabet(D)) {
        // D holds at least one symbol here, so rbegin() is valid.
        const size_type largest_symbol = D.rbegin()->first;
        bit_vector tmp_char(largest_symbol+1, 0);
        for (const auto& entry : D) {
            tmp_char[entry.first] = 1;
        }
        m_char = tmp_char;
        util::init_support(m_char_rank, &m_char);
        util::init_support(m_char_select, &m_char);
    }
    assert(D.find(0) != D.end() and 1 == D[0]); // null-byte should occur exactly once

    // sigma+1 entries, since CSAs also need the sum of all elements
    m_C = C_type(m_sigma+1, 0, bits::hi(len)+1);
    size_type running = 0;
    size_type slot = 0;
    for (const auto& entry : D) {
        m_C[slot] = running;
        ++slot;
        running += entry.second;
    }
    m_C[slot] = running; // sum of all elements
}
/*!
 * \brief Builds the run-length structure over the first \p size symbols of \p text_buf.
 *
 * The constructor
 *  - marks the first position of every run of equal symbols in a bit vector (m_bl),
 *  - stores, per symbol value in [0..max_symbol], the number of smaller symbols in m_C,
 *  - marks in a second bit vector (m_bf) the position m_C[c] + (occurrences of c seen
 *    so far) for every run head c, plus a sentinel bit at index \p size,
 *  - builds the underlying wavelet tree m_wt over the sequence of run heads, which is
 *    written to a temporary file that is removed again afterwards,
 *  - initializes the rank/select supports and caches m_bf_rank(m_C[i]) in m_C_bf_rank.
 *
 * If the buffer is empty or \p size is zero, nothing but the member initializers runs.
 *
 * \param text_buf A int_vector_buffer to the original text.
 * \param size     The length of the prefix of the text, for which the wavelet tree
 *                 should be build.
 */
wt_int_rlmn(int_vector_buffer<>& text_buf, size_type size):m_size(size), sigma(m_wt.sigma)
{
    if (0 == text_buf.size() or 0 == size)
        return;
    int_vector<> condensed_bwt;
    {
        // scope for bl and bf
        bit_vector bl = bit_vector(size, 0);
        std::map<uint64_t, uint64_t> C; // symbol -> number of occurrences
        uint64_t last_c = 0;
        size_type runs = 0;
        for (size_type i=0; i < size; ++i) {
            uint64_t c = text_buf[i];
            if (last_c != c or i==0) {
                bl[i] = 1; // position i starts a new run
                ++runs;
            }
            ++C[c];
            last_c = c;
        }
        uint64_t max_symbol = (--C.end())->first; // C is non-empty since size > 0
        m_C = int_vector<>(max_symbol+1, 0, bits::hi(size)+1);
        // Exclusive prefix sums of the symbol counts. The sorted map is walked in
        // lock-step with the symbol index instead of being queried with C[i]:
        // operator[] would insert a zero entry for every symbol that does not occur.
        std::map<uint64_t, uint64_t>::const_iterator count_it = C.begin();
        size_type prefix_sum = 0;
        for (size_type i=0; i<=max_symbol; ++i) {
            m_C[i] = prefix_sum;
            if (count_it != C.end() and count_it->first == i) {
                prefix_sum += count_it->second;
                ++count_it;
            }
        }

        int_vector<> lf_map = m_C; // next free target position per symbol
        bit_vector bf = bit_vector(size+1, 0);
        bf[size] = 1; // initialize last element
        condensed_bwt = int_vector<>(runs, 0, bits::hi(max_symbol)+1);
        runs = 0;
        for (size_type i=0; i < size; ++i) {
            uint64_t c = text_buf[i];
            if (bl[i]) {
                bf[lf_map[c]] = 1;
                condensed_bwt[runs++] = c; // keep one symbol per run
            }
            ++lf_map[c];
        }
        {
            // TODO: remove absolute file name
            std::string temp_file = "tmp_wt_int_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
            store_to_file(condensed_bwt, temp_file);
            util::clear(condensed_bwt);
            int_vector_buffer<> temp_bwt_buf(temp_file);
            // The right-hand sides below are temporaries already; no std::move needed.
            m_wt = wt_type(temp_bwt_buf, temp_bwt_buf.size());
            temp_bwt_buf.close(true);
        }
        m_bl = bit_vector_type(bl);
        m_bf = bit_vector_type(bf);
    }

    util::init_support(m_bl_rank, &m_bl);
    util::init_support(m_bf_rank, &m_bf);
    util::init_support(m_bf_select, &m_bf);
    util::init_support(m_bl_select, &m_bl);
    m_C_bf_rank = int_vector<>(m_C.size(), 0, bits::hi(size)+1);
    for (size_type i=0; i<m_C.size(); ++i) {
        m_C_bf_rank[i] = m_bf_rank(m_C[i]);
    }
}