/*!
 * Builds the byte alphabet tables from the first \p len symbols of \p text_buf.
 *
 * After construction (for a non-empty input):
 *  - m_sigma is the number of distinct byte values that occur,
 *  - m_char2comp maps an occurring byte value to its compact code; codes are
 *    assigned in increasing byte order starting at 0,
 *  - m_comp2char is the inverse mapping, resized to m_sigma entries,
 *  - m_C[c] is the number of counted symbols whose compact code is smaller
 *    than c, so m_C[m_sigma] equals \p len.
 *
 * If \p len is 0 or the stream is empty, only m_sigma is set (to 0) and the
 * vectors are left untouched.
 *
 * \param text_buf Buffered byte stream; it is reset before reading.
 * \param len      Number of symbols to count, starting at the beginning of the
 *                 stream. Must not exceed text_buf.int_vector_size.
 */
byte_alphabet_strategy::byte_alphabet_strategy(int_vector_file_buffer<8> &text_buf, int_vector_size_type len):
    char2comp(m_char2comp), comp2char(m_comp2char), C(m_C), sigma(m_sigma)
{
    m_sigma = 0;
    text_buf.reset();
    if (0 == len or 0 == text_buf.int_vector_size)
        return;
    assert(len <= text_buf.int_vector_size);
    // initialize vectors
    util::assign(m_C        , int_vector<64>(257, 0));
    util::assign(m_char2comp, int_vector<8>(256, 0));
    util::assign(m_comp2char, int_vector<8>(256, 0));
    // count occurrences of each symbol, one loaded block at a time
    for (size_type i=0, r_sum=0, r = text_buf.load_next_block(); r_sum < len;) {
        if (r_sum + r > len) {
            // the last block may reach beyond len: count only the first len symbols,
            // otherwise C would include symbols the caller did not ask for
            r = len - r_sum;
        }
        for (; i < r_sum+r; ++i) {
            ++m_C[text_buf[i-r_sum]];
        }
        r_sum += r;
        r = text_buf.load_next_block();
    }
    assert(1 == m_C[0]); // null-byte should occur exactly once
    // assign compact codes in increasing byte order and move the counts of the
    // occurring bytes to the front of m_C
    m_sigma = 0;
    for (int i=0; i<256; ++i) {
        if (m_C[i]) {
            m_char2comp[i]       = m_sigma;
            m_comp2char[m_sigma] = i;
            m_C[m_sigma]         = m_C[i];
            ++m_sigma;
        }
    }
    m_comp2char.resize(m_sigma);
    m_C.resize(m_sigma+1);
    // shift the counts one position to the right ...
    for (int i=static_cast<int>(m_sigma); i > 0; --i) {
        m_C[i] = m_C[i-1];
    }
    m_C[0] = 0;
    // ... and turn them into prefix sums
    for (int i=1; i <= static_cast<int>(m_sigma); ++i) {
        m_C[i] += m_C[i-1];
    }
    assert(C[sigma]==len);
}
/*!
 * Streams the values of \p lcp_buf through a sorted stack and records, for
 * every element that leaves the stack, one bit in \p bp.
 *
 * \p bp is replaced by a zero-initialized bit vector with one bit per value in
 * the stream. Bits are written in the order in which elements are popped; a
 * bit is set exactly when the corresponding call to pop() returns a true
 * value (NOTE(review): the meaning of that return value is defined by
 * sorted_multi_stack_support, which is not visible here).
 *
 * \param lcp_buf Buffered integer stream; it is reset before reading.
 * \param bp      Output bit vector, overwritten.
 * \param minimum If true, stacked elements larger than the incoming value are
 *                popped; if false, stacked elements smaller than the incoming
 *                value are popped.
 * \return Number of bits set in \p bp.
 */
bit_vector::size_type construct_first_p_index(int_vector_file_buffer<fixedIntWidth>& lcp_buf, bit_vector& bp, const bool minimum=true)
{
    typedef bit_vector::size_type size_type;

    size_type marked = 0;                 // number of bits set in bp so far
    lcp_buf.reset();
    const size_type n = lcp_buf.int_vector_size;
    bp = bit_vector(n, 0);
    sorted_multi_stack_support vec_stack(n);
    size_type out_pos = 0;                // next position to write in bp

    size_type pos       = 0;              // global index of the next value to read
    size_type consumed  = 0;              // number of values in all previous blocks
    size_type block_len = lcp_buf.load_next_block();
    while (consumed < n) {
        const size_type block_end = consumed + block_len;
        while (pos < block_end) {
            size_type value = lcp_buf[pos - consumed];
            // remove every stacked element that is out of order w.r.t. value
            while (!vec_stack.empty()) {
                const bool out_of_order = minimum ? (value < vec_stack.top())
                                                  : (value > vec_stack.top());
                if (!out_of_order) {
                    break;
                }
                if (vec_stack.pop()) {
                    bp[out_pos] = 1;
                    ++marked;
                }
                ++out_pos;
            }
            vec_stack.push(value);
            ++pos;
        }
        consumed  = block_end;
        block_len = lcp_buf.load_next_block();
    }

    // flush the elements that are still on the stack
    while (!vec_stack.empty()) {
        if (vec_stack.pop()) {
            bp[out_pos] = 1;
            ++marked;
        }
        ++out_pos;
    }
    return marked;
}
void construct_supercartesian_tree_bp_succinct(int_vector_file_buffer<fixedIntWidth>& lcp_buf, bit_vector& bp, const bool minimum=true) { typedef int_vector_size_type size_type; lcp_buf.reset(); size_type n = lcp_buf.int_vector_size; bp.resize(2*n); // resize bit vector for balanced parentheses to 2 n bits if (n == 0) // if n == 0 we are done return; util::set_to_value(bp, 0); sorted_multi_stack_support vec_stack(n); size_type k=0; if (minimum) { bp[k++] = 1; size_type r = lcp_buf.load_next_block(); size_type last = lcp_buf[0]; for (size_type i=1, r_sum = 0, x; r_sum < n;) { for (; i < r_sum +r; ++i) { x = lcp_buf[i-r_sum]; if (x < last) { ++k; // writing a closing parenthesis for last while (!vec_stack.empty() and x < vec_stack.top()) { vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros } } else { vec_stack.push(last); // "lazy stack" trick: Beschleunigung: ca 25 % } bp[k++] = 1; // writing an opening parenthesis last = x; } r_sum += r; r = lcp_buf.load_next_block(); } } else { // hier noch ohne "lazy stack" trick for (size_type i=0, r_sum = 0, r = lcp_buf.load_next_block(), x; r_sum < n;) { for (; i < r_sum +r; ++i) { x = lcp_buf[i-r_sum]; while (!vec_stack.empty() and x > vec_stack.top()) { vec_stack.pop(); ++k; // writing a closing parenthesis, bp is already initialized to zeros } vec_stack.push(x); bp[k++] = 1; // writing an opening parenthesis } r_sum += r; r = lcp_buf.load_next_block(); } } }
/*!
 * Counts how often each byte value occurs among the first \p size symbols of
 * \p text.
 *
 * The counters are only incremented, never reset: the caller has to
 * initialize \p C (typically to zero) before the call.
 *
 * \param text Buffered byte stream; it is reset before reading.
 * \param size Number of symbols to count, starting at the beginning of the stream.
 * \param C    Array of counters indexed by byte value; it must have an entry
 *             for every byte value that occurs in the counted range.
 * \throws std::logic_error if the stream holds fewer than \p size symbols.
 */
void calculate_character_occurences(int_vector_file_buffer<8, size_type_class>& text, const size_type size, size_type* C)
{
    text.reset();
    if (text.int_vector_size < size) {
        throw std::logic_error("calculate_character_occurences: stream size is smaller than size!");
    }
    for (size_type i=0, r_sum=0, r = text.load_next_block(); r_sum < size;) {
        if (r_sum + r > size) {
            // the last block may reach beyond size: read no more than size symbols
            r = size - r_sum;
        }
        for (; i < r_sum+r; ++i) {
            ++C[text[i-r_sum]];
        }
        r_sum += r;
        r = text.load_next_block();
    }
}
/*!
 * Builds the alphabet tables from the first \p len symbols of \p text_buf.
 *
 * After construction (for a non-empty input):
 *  - m_sigma is the number of distinct symbols that occur,
 *  - if the alphabet is not continuous (as decided by is_continuous_alphabet),
 *    m_char marks every occurring symbol and m_char_rank / m_char_select are
 *    initialized on it; otherwise these members are left untouched,
 *  - m_C[c] is the number of counted symbols that are smaller than the c-th
 *    smallest occurring symbol, and m_C[m_sigma] is the total number counted.
 *
 * If \p len is 0 or the stream is empty, only m_sigma is set (to 0).
 *
 * \param text_buf Buffered symbol stream; it is reset before reading.
 * \param len      Number of symbols to count, starting at the beginning of the
 *                 stream. Must not exceed text_buf.int_vector_size.
 */
int_alphabet_strategy(int_vector_file_buffer<0> &text_buf, int_vector_size_type len):
    char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
{
    m_sigma = 0;
    text_buf.reset();
    if (0 == len or 0 == text_buf.int_vector_size)
        return;
    assert(len <= text_buf.int_vector_size);
    // symbol -> number of occurrences; std::map keeps the symbols in sorted order
    std::map<size_type, size_type> D;
    // count occurrences of each symbol, one loaded block at a time
    for (size_type i=0, r_sum=0, r = text_buf.load_next_block(); r_sum < len;) {
        if (r_sum + r > len) {
            // the last block may reach beyond len: count only the first len symbols,
            // the width of m_C below is derived from len and could not hold more
            r = len - r_sum;
        }
        for (; i < r_sum+r; ++i) {
            D[text_buf[i-r_sum]]++;
        }
        r_sum += r;
        r = text_buf.load_next_block();
    }
    m_sigma = D.size();
    if (is_continuous_alphabet(D)) {
        // do not initialize m_char, m_char_rank and m_char_select since we can map directly
    } else {
        // note: the alphabet has at least size 1, so D.rbegin() is dereferenceable
        size_type largest_symbol = D.rbegin()->first;
        bit_vector tmp_char(largest_symbol+1, 0);
        for (std::map<size_type, size_type>::const_iterator it = D.begin(), end = D.end(); it != end; ++it) {
            tmp_char[it->first] = 1; // mark occurring symbol
        }
        util::assign(m_char, tmp_char);
        util::init_support(m_char_rank, &m_char);
        util::init_support(m_char_select, &m_char);
    }
    assert(D.find(0) != D.end() and 1 == D[0]); // null-byte should occur exactly once

    // resize to sigma+1, since CSAs also need the sum of all elements
    util::assign(m_C, C_type(m_sigma+1, 0, bit_magic::l1BP(len)+1));
    size_type sum = 0, idx = 0;
    for (std::map<size_type, size_type>::const_iterator it = D.begin(), end = D.end(); it != end; ++it) {
        m_C[idx++] = sum;   // number of symbols smaller than it->first
        sum += it->second;
    }
    m_C[idx] = sum; // insert sum of all elements
}
/*!
 * Builds the succinct byte alphabet tables from the first \p len symbols of
 * \p text_buf.
 *
 * After construction (for a non-empty input):
 *  - m_sigma is the number of distinct byte values that occur,
 *  - m_char has a set bit for every occurring byte value, and m_char_rank /
 *    m_char_select are initialized on it,
 *  - m_C[c] is the number of counted symbols that are smaller than the c-th
 *    smallest occurring byte, so m_C[m_sigma] equals \p len.
 *
 * If \p len is 0 or the stream is empty, only m_sigma is set (to 0).
 *
 * \param text_buf Buffered byte stream; it is reset before reading.
 * \param len      Number of symbols to count, starting at the beginning of the
 *                 stream. Must not exceed text_buf.int_vector_size.
 */
succinct_byte_alphabet_strategy(int_vector_file_buffer<8> &text_buf, int_vector_size_type len):
    char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
{
    m_sigma = 0;
    text_buf.reset();
    if (0 == len or 0 == text_buf.int_vector_size)
        return;
    assert(len <= text_buf.int_vector_size);
    // initialize vectors
    int_vector<64> D(257, 0);      // occurrence counter per byte value
    bit_vector tmp_char(256, 0);   // marks the byte values that occur
    // count occurrences of each symbol, one loaded block at a time
    for (size_type i=0, r_sum=0, r = text_buf.load_next_block(); r_sum < len;) {
        if (r_sum + r > len) {
            // the last block may reach beyond len: count only the first len symbols,
            // the width of m_C below is derived from len and could not hold more
            r = len - r_sum;
        }
        for (; i < r_sum+r; ++i) {
            ++D[text_buf[i-r_sum]];
        }
        r_sum += r;
        r = text_buf.load_next_block();
    }
    assert(1 == D[0]); // null-byte should occur exactly once
    m_sigma = 0;
    for (int i=0; i<256; ++i) {
        if (D[i]) {
            tmp_char[i] = 1;    // mark occurring character
            D[m_sigma] = D[i];  // move the counts of occurring characters to the front of D
            ++m_sigma;
        }
    }
    // resize to sigma+1, since CSAs also need the sum of all elements
    util::assign(m_C, C_type(m_sigma+1, 0, bit_magic::l1BP(len)+1));
    // copy the counts shifted by one position ...
    for (int i=static_cast<int>(m_sigma); i > 0; --i) {
        m_C[i] = D[i-1];
    }
    m_C[0] = 0;
    // ... and turn them into prefix sums
    for (int i=1; i <= static_cast<int>(m_sigma); ++i) {
        m_C[i] = m_C[i] + m_C[i-1];
    }
    assert(m_C[sigma]==len);
    util::assign(m_char, tmp_char);
    util::init_support(m_char_rank, &m_char);
    util::init_support(m_char_select, &m_char);
}
/*!
 * Writes a balanced parentheses sequence for the values of \p lcp_buf into
 * \p bp (1 = opening parenthesis, 0 = closing parenthesis) and, alongside it,
 * one bit per closing parenthesis into \p bp_fc.
 *
 * \p bp is resized to 2*n bits and \p bp_fc to n bits, where n is the number
 * of values in the stream; both are cleared. Closing parentheses are produced
 * by skipping over positions of \p bp that are already zero. The bit in
 * \p bp_fc that belongs to a closing parenthesis is set exactly when the
 * corresponding call to pop() returns a true value (NOTE(review): the meaning
 * of that return value is defined by sorted_multi_stack_support, which is not
 * visible here).
 *
 * The "lazy stack" trick is not used in this function.
 *
 * \param lcp_buf Buffered integer stream; it is reset before reading.
 * \param bp      Output bit vector for the parentheses, overwritten.
 * \param bp_fc   Output bit vector for the first-child marks, overwritten.
 * \param minimum If true, an incoming value closes all open elements that are
 *                larger; if false, all open elements that are smaller.
 * \return Number of bits set in \p bp_fc (0 for an empty stream).
 */
int_vector_size_type construct_supercartesian_tree_bp_succinct_and_first_child(int_vector_file_buffer<fixedIntWidth>& lcp_buf, bit_vector& bp, bit_vector& bp_fc, const bool minimum=true)
{
    typedef int_vector_size_type size_type;
    lcp_buf.reset();
    const size_type n = lcp_buf.int_vector_size;
    bp.resize(2*n);     // two parentheses per value
    bp_fc.resize(n);    // one bit per closing parenthesis
    if (0 == n) {       // nothing to do for an empty stream
        return 0;
    }
    size_type fc_cnt = 0;   // number of bits set in bp_fc
    util::set_to_value(bp, 0);
    util::set_to_value(bp_fc, 0);
    sorted_multi_stack_support vec_stack(n);
    size_type bp_pos = 0;   // next position in bp
    size_type fc_pos = 0;   // next position in bp_fc

    size_type pos       = 0;   // global index of the next value to read
    size_type consumed  = 0;   // number of values in all previous blocks
    size_type block_len = lcp_buf.load_next_block();
    while (consumed < n) {
        const size_type block_end = consumed + block_len;
        while (pos < block_end) {
            size_type value = lcp_buf[pos - consumed];
            // close every open element that is out of order w.r.t. value
            while (!vec_stack.empty()) {
                const bool out_of_order = minimum ? (value < vec_stack.top())
                                                  : (value > vec_stack.top());
                if (!out_of_order) {
                    break;
                }
                if (vec_stack.pop()) {
                    bp_fc[fc_pos] = 1;
                    ++fc_cnt;
                }
                ++bp_pos;   // closing parenthesis, bp is already zero here
                ++fc_pos;   // one bit of bp_fc per closing parenthesis
            }
            vec_stack.push(value);
            bp[bp_pos] = 1; // opening parenthesis for value
            ++bp_pos;
            ++pos;
        }
        consumed  = block_end;
        block_len = lcp_buf.load_next_block();
    }

    // close the elements that are still open at the end of the stream
    while (!vec_stack.empty()) {
        if (vec_stack.pop()) {
            bp_fc[fc_pos] = 1;
            ++fc_cnt;
        }
        // no write to bp is needed for the closing parenthesis: bp is already zero here
        ++bp_pos;
        ++fc_pos;
    }
    return fc_cnt;
}