//! Construct the doc_border bitvector by streaming the text file void construct_doc_border(const std::string& text_file, bit_vector& doc_border) { int_vector_buffer<WIDTH> text_buf(text_file); doc_border = bit_vector(text_buf.size(), 0); for (size_type i = 0; i < text_buf.size(); ++i) { if (t_doc_delim == text_buf[i]) { doc_border[i] = 1; } } }
void bit_index_storage::remove_row(const string& row) { if (bitvals_.find(row) == bitvals_.end()) { // The row is not in the master table; we can // immedeately remove it from the diff table. bitvals_diff_.erase(row); } else { // Keep the row in the diff table until next MIX to // propagate the removal of this row to other nodes. bitvals_diff_[row] = bit_vector(); } }
//! Construct an int_vector of `count` elements, each stored in `bpe` bits.
/*! \param count Number of elements to allocate.
 *  \param bpe   Bit width of a single element.
 *  \throws std::domain_error if bpe is not strictly smaller than the bit
 *          width of one storage block (value_type).
 */
int_vector::int_vector(const size_type count, const int bpe)
    : num_elems{count}, bits_per_element{bpe}
{
    // NOTE(review): if size_type is unsigned this assert is a tautology and
    // only documents intent — confirm size_type's signedness.
    assert(count >= 0);
    assert(bpe >= 0);
    // Bits available in one block of the underlying bit sequence.
    constexpr auto bits_per_block = std::numeric_limits<value_type>::digits;
    if (bits_per_element >= bits_per_block) {
        throw std::domain_error("int_vector: Too many bits per element");
    }
    // Backing store: a flat bit sequence of count * bpe bits.
    // NOTE(review): num_elems * bits_per_element can overflow for very large
    // inputs — TODO confirm callers bound count.
    bit_seq = bit_vector(num_elems * bits_per_element);
}
//! Mark the "first index" positions while simulating a super-Cartesian tree
//! construction over a streamed LCP array.
/*! \param lcp_buf File buffer delivering the LCP values block by block.
 *  \param bp      Output bitvector of length lcp_buf.int_vector_size; the
 *                 k-th processed stack-pop sets bp[k]=1 iff vec_stack.pop()
 *                 reports it as a "first" element.
 *  \param minimum If true, pop while the incoming value is smaller than the
 *                 stack top (min variant); otherwise pop while it is greater
 *                 (max variant).
 *  \return Number of bits set in bp.
 */
bit_vector::size_type construct_first_p_index(int_vector_file_buffer<fixedIntWidth>& lcp_buf, bit_vector& bp, const bool minimum=true)
{
    typedef bit_vector::size_type size_type;
    size_type nr_of_first_indices = 0;
    lcp_buf.reset();
    size_type n = lcp_buf.int_vector_size;
    bp = bit_vector(n, 0);
    // Monotone stack over the streamed values; sized for the worst case n.
    sorted_multi_stack_support vec_stack(n);
    size_type k=0; // global pop counter; indexes bp
    if (minimum) {
        // i: global position, r_sum: values consumed so far, r: size of the
        // current block, x: current value.
        for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) {
            for (; i<r_sum+r; ++i) {
                x = lcp_buf[i-r_sum]; // index relative to the current block
                // Pop all strictly larger elements before pushing x.
                while (!vec_stack.empty() and x < vec_stack.top()) {
                    if (vec_stack.pop()) { // pop() is true for a "first" element
                        bp[k] = 1;
                        ++nr_of_first_indices;
                    }
                    ++k;
                }
                vec_stack.push(x);
            }
            r_sum += r; r = lcp_buf.load_next_block();
        }
    } else {
        // Same streaming loop with the comparison inverted (max variant).
        for (size_type i = 0, r_sum = 0, r = lcp_buf.load_next_block(),x; r_sum < n;) {
            for (; i<r_sum+r; ++i) {
                x = lcp_buf[i-r_sum];
                while (!vec_stack.empty() and x > vec_stack.top()) {
                    if (vec_stack.pop()) {
                        bp[k] = 1;
                        ++nr_of_first_indices;
                    }
                    ++k;
                }
                vec_stack.push(x);
            }
            r_sum += r; r = lcp_buf.load_next_block();
        }
    }
    // Drain the remaining stack entries after the whole input was consumed.
    while (!vec_stack.empty()) {
        if (vec_stack.pop()) {
            bp[k] = 1;
            ++nr_of_first_indices;
        }
        ++k;
    }
    // assert( k == vec.size() );
    return nr_of_first_indices;
}
void construct(){ if( m_v == NULL ){ m_sct_bp = bit_vector(0); m_sct_bp_support = Bp_support(); }else{ #ifdef RMQ_SCT_BUILD_BP_NOT_SUCCINCT // this method takes \f$n\log n\f$ bits extra space in the worst case algorithm::construct_supercartesian_tree_bp(*m_v, m_sct_bp); #else // this method takes only \f$n\f$ bits extra space in all cases algorithm::construct_supercartesian_tree_bp_succinct(*m_v, m_sct_bp); #endif m_sct_bp_support = Bp_support(&m_sct_bp); } }
rmq_succinct_sct(const RandomAccessContainer* v=NULL) : sct_bp(m_sct_bp), sct_bp_support(m_sct_bp_support) { if (v == NULL) { util::assign(m_sct_bp, bit_vector()); util::assign(m_sct_bp_support, Bp_support()); } else { #ifdef RMQ_SCT_BUILD_BP_NOT_SUCCINCT // this method takes \f$n\log n\f$ bits extra space in the worst case algorithm::construct_supercartesian_tree_bp(*v, m_sct_bp, Minimum); #else // this method takes only \f$n\f$ bits extra space in all cases algorithm::construct_supercartesian_tree_bp_succinct(*v, m_sct_bp, Minimum); // TODO: constructor which uses int_vector_file_buffer #endif util::assign(m_sct_bp_support, Bp_support(&m_sct_bp)); } }
// Apply the globally mixed diff to the master table and rebuild the local
// diff table so that pending removals survive until the next MIX.
// Returns true unconditionally.
bool bit_index_storage::put_diff(
    const bit_table_t& mixed_diff) {
  // Phase 1: fold the mixed diff into the master table.
  for (bit_table_t::const_iterator it = mixed_diff.begin();
       it != mixed_diff.end(); ++it) {
    if (it->second.bit_num() == 0) {
      // 0-bit bit_vector was propagated from other nodes. This indicates
      // that the row should be removed globally from the master table.
      if (unlearner_) {
        unlearner_->remove(it->first);
      }
      bitvals_.erase(it->first);
    } else {
      if (unlearner_) {
        if (unlearner_->can_touch(it->first)) {
          unlearner_->touch(it->first);
        } else {
          continue;  // drop untouchable value
        }
      }
      bitvals_[it->first] = it->second;
    }
  }
  // Phase 2: collect local removals that are NOT yet reflected in the
  // mixed diff. New empty rows were created by unlearner and remove_row
  // between get_diff and put_diff.
  std::vector<std::string> removed_ids;
  for (bit_table_t::const_iterator it = bitvals_diff_.begin();
       it != bitvals_diff_.end(); ++it) {
    if (it->second.bit_num() == 0) {
      bit_table_t::const_iterator pos;
      pos = mixed_diff.find(it->first);
      // Keep the removal pending if the mixed diff either does not know the
      // row or re-introduces it with real data.
      if (pos == mixed_diff.end() || pos->second.bit_num() != 0) {
        removed_ids.push_back(it->first);
      }
    }
  }
  bitvals_diff_.clear();
  // Keep empty rows in the diff area until next MIX to
  // propagate the removal of this data to other nodes.
  for (size_t i = 0; i < removed_ids.size(); ++i) {
    bitvals_diff_[removed_ids[i]] = bit_vector();
  }
  return true;
}
void bit_index_storage::get_row(const string& row, bit_vector& bv) const { { bit_table_t::const_iterator it = bitvals_diff_.find(row); if (it != bitvals_diff_.end()) { bv = it->second; return; } } { bit_table_t::const_iterator it = bitvals_.find(row); if (it != bitvals_.end()) { bv = it->second; return; } } bv = bit_vector(); }
//! Mark the "first index" positions while simulating a (super-)Cartesian
//! tree construction over an in-memory container.
/*! \param vec     Input sequence.
 *  \param bp      Output bitvector of length vec.size(); the k-th pop from
 *                 the index stack sets bp[k]=1 iff the popped value differs
 *                 from the value below it (i.e. it is the first of a run of
 *                 equal values).
 *  \param minimum If true, pop while vec[i] is smaller than the value at the
 *                 stack top; otherwise pop while it is greater.
 *  \return Number of bits set in bp.
 */
typename RandomAccessContainer::size_type construct_first_p_index(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
{
    typedef typename RandomAccessContainer::size_type size_type;
    size_type nr_of_first_indices = 0;
    bp = bit_vector(vec.size(), 0);
    // std::cerr<<"bp.size()="<<bp.size()<<std::endl;
    // Stack of indices into vec; top() returns an index, not a value.
    sorted_stack_support vec_stack(vec.size());
    size_type k=0; // global pop counter; indexes bp
    for (size_type i=0, t; i < vec.size(); ++i) {
        if (minimum) {
            // Pop all indices whose value is strictly larger than vec[i].
            while (vec_stack.size() > 0 and vec[i] < vec[vec_stack.top()]) {
                t = vec[vec_stack.top()]; // remember the popped value
                vec_stack.pop();
                // First of a run of equal values iff the stack is empty or
                // the next value differs.
                if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
                    bp[k] = 1;
                    ++nr_of_first_indices;
                }
                ++k;
            }
        } else {
            // Max variant: identical logic with the comparison inverted.
            while (vec_stack.size() > 0 and vec[i] > vec[vec_stack.top()]) {
                t = vec[vec_stack.top()];
                vec_stack.pop();
                if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
                    bp[k] = 1;
                    ++nr_of_first_indices;
                }
                ++k;
            }
        }
        vec_stack.push(i);
    }
    // Drain the remaining indices after the whole input was consumed.
    while (vec_stack.size() > 0) {
        size_type t = vec[vec_stack.top()];
        vec_stack.pop();
        if (vec_stack.size() == 0 or t != vec[vec_stack.top()]) {
            bp[k] = 1;
            ++nr_of_first_indices;
        }
        ++k;
    }
    // Every element is pushed and popped exactly once.
    assert(k == vec.size());
    return nr_of_first_indices;
}
//! Constructor: build the sampled dictionary over the set bits of v.
/*! \param v The supported bit_vector.
 *  \throws std::logic_error if the class parameter sample_dens is 0.
 *
 *  Pass 1 counts the ones and the maximal gap between consecutive ones
 *  (this bounds the width needed for the difference array); pass 2 fills
 *  the absolute samples (every sample_dens-th one) and the gap-encoded
 *  positions of the remaining ones.
 */
nearest_neighbour_dictionary(const bit_vector& v):m_ones(0), m_size(0)
{
    if (sample_dens==0) { // first logical error check
        throw std::logic_error(util::demangle(typeid(this).name())+": sample_dens should not be equal 0!");
    }
    size_type max_distance_between_two_ones = 0;
    size_type ones = 0; // counter for the ones in v
    // get maximal distance between to ones in the bit vector
    // speed this up by broadword computing
    for (size_type i=0, last_one_pos_plus_1=0; i < v.size(); ++i) {
        if (v[i]) {
            if (i+1-last_one_pos_plus_1 > max_distance_between_two_ones)
                max_distance_between_two_ones = i+1-last_one_pos_plus_1;
            last_one_pos_plus_1 = i+1;
            ++ones;
        }
    }
    m_ones = ones;
    m_size = v.size();
    // std::cerr<<ones<<std::endl;
    // initialize absolute samples m_abs_samples[0]=0
    // (one sample per sample_dens ones, wide enough to address any position)
    m_abs_samples = int_vector<>(m_ones/sample_dens + 1, 0, bits::hi(v.size())+1);
    // initialize difference values: gaps between consecutive ones that are
    // not absolute samples, wide enough for the maximal gap found above
    m_differences = int_vector<>(m_ones - m_ones/sample_dens, 0, bits::hi(max_distance_between_two_ones)+1);
    // initialize m_contains_abs_sample: one bit per sample_dens-sized chunk
    // of positions, set iff the chunk contains an absolute sample
    m_contains_abs_sample = bit_vector((v.size()+sample_dens-1)/sample_dens, 0);
    ones = 0;
    // Pass 2: record samples and gaps.
    for (size_type i=0, last_one_pos=0; i < v.size(); ++i) {
        if (v[i]) {
            ++ones;
            if ((ones % sample_dens) == 0) { // insert absolute samples
                m_abs_samples[ones/sample_dens] = i;
                m_contains_abs_sample[i/sample_dens] = 1;
            } else {
                // gap to the previous one; index skips the sampled ones
                m_differences[ones - ones/sample_dens - 1] = i - last_one_pos;
            }
            last_one_pos = i;
        }
    }
    util::init_support(m_rank_contains_abs_sample, &m_contains_abs_sample);
}
void bit_index_storage::get_row(const string& row, bit_vector& bv) const { { // First find the row in the diff table. bit_table_t::const_iterator it = bitvals_diff_.find(row); if (it != bitvals_diff_.end() && it->second.bit_num() != 0) { // Row found, and is not 0-bit. 0-bit rows in the diff table means // that the row has been removed but not MIXed yet. bv = it->second; return; } } { // Next we find the row in the master table. bit_table_t::const_iterator it = bitvals_.find(row); if (it != bitvals_.end()) { bv = it->second; return; } } bv = bit_vector(); }
//! Prepare a builder for an sd_vector of length n with capacity for m ones.
/*! \param n Length of the bitvector to build.
 *  \param m Number of 1-bits that will be inserted.
 *  \throws std::runtime_error if m > n.
 */
sd_vector_builder::sd_vector_builder(size_type n, size_type m) :
    m_size(n), m_capacity(m),
    m_wl(0),
    m_tail(0), m_items(0),
    m_last_high(0), m_highpos(0)
{
    if (m_capacity > m_size) {
        throw std::runtime_error("sd_vector_builder: requested capacity is larger than vector size.");
    }
    if (m_capacity == 0) {
        return; // nothing to allocate for an all-zero vector
    }

    // Elias-Fano split: each position is divided into a logm-bit high part
    // and a (logn-logm)-bit low part.
    size_type logm = bits::hi(m_capacity) + 1;
    const size_type logn = bits::hi(m_size) + 1;
    if (logm == logn) {
        logm--; // to ensure logn-logm > 0
    }
    m_wl = logn - logm;
    m_low = int_vector<>(m_capacity, 0, m_wl);
    // Unary-coded high parts: m ones distributed over 2^logm buckets.
    m_high = bit_vector(m_capacity + (1ULL << logm), 0);
}
//! Build a gamma-coded vector from a range of integers.
/*! \param ints Input range; each value is stored as val+1, split into its
 *              most significant bit (encoded in unary via the high-bits
 *              darray) and the remaining low bits.
 *  NOTE(review): the leading high_bits.append1() appears to emit a sentinel
 *  before the first element — confirm against darray64's decoding side.
 */
gamma_vector(Range const& ints)
{
    darray64::builder high_bits;
    bit_vector_builder low_bits;
    high_bits.append1(); // sentinel/terminator bit preceding the payload
    typedef typename boost::range_const_iterator<Range>::type iterator_t;
    for (iterator_t iter = boost::begin(ints); iter != boost::end(ints); ++iter) {
        // Shift by one so that 0 is representable (gamma codes need val >= 1).
        const value_type val = *iter + 1;
        uint8_t l = broadword::msb(val);
        // Store the l bits below the MSB; the MSB itself is implicit in l.
        low_bits.append_bits(val ^ (uint64_t(1) << l), l);
        // Record l as a unary run in the high-bits sequence.
        high_bits.append1(l);
    }
    // Materialize the builders into the members via swap.
    darray64(&high_bits).swap(m_high_bits);
    bit_vector(&low_bits).swap(m_low_bits);
}
// End-to-end check of the get_diff / mix / set_mixed_and_clear_diff cycle:
// two storages contribute diffs, the mixed result is installed into a third
// storage and must replace its local state completely.
TEST(bit_index_storage, mix) {
  bit_index_storage s1, s2, s3;
  // s1 contributes r1 and r2.
  s1.set_row("r1", make_vector("0101"));
  s1.set_row("r2", make_vector("1010"));
  string d1;
  s1.get_diff(d1);
  // s2 contributes a conflicting r1 and a new r3.
  s2.set_row("r1", make_vector("1110"));
  s2.set_row("r3", make_vector("1100"));
  string d2;
  s2.get_diff(d2);
  // Merge d1 into d2; on the r1 conflict s1's value wins (see table below).
  s1.mix(d1, d2);
  // d2 is
  // r1: 0101 (s1)
  // r2: 1010 (s1)
  // r3: 1100 (s2)
  // s3 starts with different values for r1..r3 plus an extra row r4.
  s3.set_row("r1", make_vector("1111"));
  s3.set_row("r2", make_vector("1111"));
  s3.set_row("r3", make_vector("1111"));
  s3.set_row("r4", make_vector("1111"));
  s3.set_mixed_and_clear_diff(d2);
  // r1, r2 and r3 are overwritten by d2
  // r4 is no longer retained
  bit_vector v;
  s3.get_row("r1", v);
  EXPECT_TRUE(v == make_vector("0101"));
  s3.get_row("r2", v);
  EXPECT_TRUE(v == make_vector("1010"));
  s3.get_row("r3", v);
  EXPECT_TRUE(v == make_vector("1100"));
  s3.get_row("r4", v);
  EXPECT_TRUE(v == bit_vector());
}
//! Construct the DAC (Directly Addressable Codes) LCP representation from a
//! cached LCP array.
/*! \param config Cache configuration used to locate the serialized LCP file.
 *
 *  Each LCP value is split into t_b-bit chunks; chunk j of a value lives on
 *  level j. m_overflow marks blocks whose value continues on the next level,
 *  and m_level_pointer_and_rank interleaves, per level, the start offset
 *  (even slots) and a precomputed overflow-rank (odd slots).
 */
lcp_dac<t_b, t_rank>::lcp_dac(cache_config& config)
{
    // (1) Count for each level, how many blocks are needed for the representation
    //     Running time: \f$ O(n \times \frac{\log n}{b}) \f$
    //     Result is stored in m_level_pointer_and_rank
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    int_vector_buffer<> lcp_buf(lcp_file);
    size_type n = lcp_buf.size(), val=0;
    if (n == 0)
        return; // empty LCP array: leave all members default-constructed
    // initialize counter; two slots (pointer + rank) per possible level
    auto _size = std::max(4*bits::hi(2), 2*(((bits::hi(n)+1)+t_b-1) / t_b));
    m_level_pointer_and_rank.resize(_size);
    for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i)
        m_level_pointer_and_rank[i] = 0;
    m_level_pointer_and_rank[0] = n; // level 0 has n entries
    uint8_t level_x_2 = 0;           // 2 * current level (even slots only)
    for (size_type i=0; i < n; ++i) {
        val=lcp_buf[i];
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // increase counter for current level by 1
            ++m_level_pointer_and_rank[level_x_2];
            val >>= t_b;     // shift value b bits to the right
            level_x_2 += 2;  // increase level by 1
        }
    }
    // (2) Determine maximum level and prefix sums of level counters,
    //     turning per-level counts into start offsets.
    m_max_level = 0;
    size_type sum_blocks = 0, last_block_size=0;
    for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) {
        t = sum_blocks;
        sum_blocks += m_level_pointer_and_rank[i];
        m_level_pointer_and_rank[i] = t;
        if (sum_blocks > t) {
            ++m_max_level;
            last_block_size = sum_blocks - t; // size of the topmost level
        }
    }
    // Blocks on the last level can never overflow, so m_overflow excludes them.
    m_overflow = bit_vector(sum_blocks - last_block_size, 0);
    m_data.resize(sum_blocks);
    assert(last_block_size > 0);
    // (3) Enter block and overflow data; cnt holds the per-level write cursors.
    int_vector<64> cnt = m_level_pointer_and_rank;
    const uint64_t mask = bits::lo_set[t_b]; // lowest t_b bits of a value
    for (size_type i=0, j=0; i < n; ++i) {
        val=lcp_buf[i];
        j = cnt[0]++;
        m_data[ j ] = val & mask;
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            m_overflow[j] = 1;       // previous chunk continues here
            j = cnt[level_x_2]++;
            m_data[ j ] = val & mask;
            val >>= t_b;     // shift value b bits to the right
            level_x_2 += 2;  // increase level by 1
        }
    }
    // (4) Initialize rank data structure for m_overflow and precalc rank for
    //     pointers (stored in the odd slots of m_level_pointer_and_rank)
    util::init_support(m_overflow_rank, &m_overflow);
    for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and
         m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) {
        m_level_pointer_and_rank[2*i+1] = m_overflow_rank(
            m_level_pointer_and_rank[2*i]);
    }
}
//! Construct the run-length compressed wavelet tree from a text buffer.
/*! \param text_buf An int_vector_buffer to the original text (typically a BWT).
 *  \param size     The length of the prefix of the text, for which the
 *                  wavelet tree should be build.
 *
 *  bl marks the first position of each run in the text; bf marks the
 *  LF-mapped first positions. The wavelet tree itself is built over the
 *  condensed sequence of run heads (one symbol per run), which is staged
 *  through a temporary file.
 */
wt_int_rlmn(int_vector_buffer<>& text_buf, size_type size):m_size(size), sigma(m_wt.sigma)
{
    if (0 == text_buf.size() or 0 == size)
        return; // empty input: nothing to build
    int_vector<> condensed_bwt;
    {   // scope for bl and bf
        bit_vector bl = bit_vector(size, 0); // run-head marks in text order
        std::map<uint64_t, uint64_t> C;      // symbol -> occurrence count
        uint64_t last_c = 0;
        size_type runs = 0; // number of runs in the prefix
        for (size_type i=0; i < size; ++i) {
            uint64_t c = text_buf[i];
            if (last_c != c or i==0) { // position i starts a new run
                bl[i] = 1;
                ++runs;
            }
            ++C[c];
            last_c = c;
        }
        // C is ordered, so the last key is the largest symbol.
        uint64_t max_symbol = (--C.end())->first;
        // m_C[c] = number of occurrences of symbols smaller than c.
        m_C = int_vector<>(max_symbol+1, 0, bits::hi(size)+1);
        for (size_type i=0, prefix_sum=0; i<=max_symbol; ++i) {
            m_C[i] = prefix_sum;
            prefix_sum += C[i];
        }
        // lf_map[c] = next LF position for symbol c, advanced while scanning.
        int_vector<> lf_map = m_C;
        bit_vector bf = bit_vector(size+1, 0); // run-head marks in LF order
        bf[size] = 1; // initialize last element (sentinel)
        condensed_bwt = int_vector<>(runs, 0, bits::hi(max_symbol)+1);
        runs = 0;
        for (size_type i=0; i < size; ++i) {
            uint64_t c = text_buf[i];
            if (bl[i]) {
                // Run head: mark its LF image and record the run symbol.
                bf[lf_map[c]] = 1;
                condensed_bwt[runs++] = c;
            }
            ++lf_map[c];
        }
        {
            // Stage the condensed sequence through a temp file so the
            // wavelet tree can be built from a buffer.
            // TODO: remove absolute file name
            std::string temp_file = "tmp_wt_int_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
            store_to_file(condensed_bwt, temp_file);
            util::clear(condensed_bwt);
            int_vector_buffer<> temp_bwt_buf(temp_file);
            m_wt = std::move(wt_type(temp_bwt_buf, temp_bwt_buf.size()));
            temp_bwt_buf.close(true); // close and remove the temp file
        }
        m_bl = std::move(bit_vector_type(bl));
        m_bf = std::move(bit_vector_type(bf));
    }
    // Rank/select support over the run-boundary bitvectors.
    util::init_support(m_bl_rank, &m_bl);
    util::init_support(m_bf_rank, &m_bf);
    util::init_support(m_bf_select, &m_bf);
    util::init_support(m_bl_select, &m_bl);
    // Precompute bf-ranks at the C-array boundaries.
    m_C_bf_rank = int_vector<>(m_C.size(), 0, bits::hi(size)+1);
    for (size_type i=0; i<m_C.size(); ++i) {
        m_C_bf_rank[i] = m_bf_rank(m_C[i]);
    }
}
//! Construct an RRR-compressed bitvector from an uncompressed one.
/*!
 *  \param bv Uncompressed bitvector.
 *  \param k  Store rank samples and pointers each k-th blocks (class
 *            parameter t_k; see NOTE below).
 *
 *  Pass (1) computes each t_bs-sized block's type (its popcount) and the
 *  total space needed for the block-type numbers; pass (2) writes the
 *  per-superblock pointers/rank samples and the encoded block numbers.
 *  Superblocks in which most blocks are more than half full are stored
 *  inverted (m_invert) to keep the class numbers small.
 */
rrr_vector(const bit_vector& bv)
{
    m_size = bv.size();
    int_vector<> bt_array;
    bt_array.width(bits::hi(t_bs)+1);
    bt_array.resize((m_size+t_bs)/((size_type)t_bs)); // blocks for the bt_array + a dummy block at the end,
                                                      // if m_size%t_bs == 0
    // (1) calculate the block types and store them in m_bt
    size_type pos = 0, i = 0, x;
    size_type btnr_pos = 0; // total bits needed for all block-type numbers
    size_type sum_rank = 0; // running popcount of bv
    while (pos + t_bs <= m_size) { // handle all full blocks
        bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, t_bs);
        sum_rank += x;
        btnr_pos += rrr_helper_type::space_for_bt(x);
        pos += t_bs;
    }
    if (pos < m_size) { // handle last not full block
        bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, m_size - pos);
        sum_rank += x;
        btnr_pos += rrr_helper_type::space_for_bt(x);
    }
    m_btnr = bit_vector(std::max(btnr_pos, (size_type)64), 0); // max necessary for case: t_bs == 1
    // One pointer and one rank sample per superblock of t_k blocks.
    m_btnrp = int_vector<>((bt_array.size()+t_k-1)/t_k, 0, bits::hi(btnr_pos)+1);
    m_rank = int_vector<>((bt_array.size()+t_k-1)/t_k + ((m_size % (t_k*t_bs))>0), 0, bits::hi(sum_rank)+1);
    //                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^
    // only add a finishing block, if the last block of the superblock is not a dummy block
    m_invert = bit_vector((bt_array.size()+t_k-1)/t_k, 0);
    // (2) calculate block type numbers and pointers into btnr and rank samples
    pos = 0; i = 0;
    btnr_pos= 0, sum_rank = 0;
    bool invert = false; // whether the current superblock is stored inverted
    while (pos + t_bs <= m_size) { // handle all full blocks
        if ((i % t_k) == (size_type)0) { // superblock boundary
            m_btnrp[ i/t_k ] = btnr_pos;
            m_rank[ i/t_k ] = sum_rank;
            // calculate invert bit for that superblock
            if (i+t_k <= bt_array.size()) {
                size_type gt_half_t_bs = 0; // counter for blocks greater than half of the blocksize
                for (size_type j=i; j < i+t_k; ++j) {
                    if (bt_array[j] > t_bs/2)
                        ++gt_half_t_bs;
                }
                if (gt_half_t_bs > (t_k/2)) {
                    // Majority of blocks are dense: store the superblock
                    // inverted so block types stay small.
                    m_invert[ i/t_k ] = 1;
                    for (size_type j=i; j < i+t_k; ++j) {
                        bt_array[j] = t_bs - bt_array[j];
                    }
                    invert = true;
                } else {
                    invert = false;
                }
            } else {
                invert = false; // partial superblock at the end: never inverted
            }
        }
        uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]);
        sum_rank += (invert ? (t_bs - x) : x); // undo inversion for the rank sum
        if (space_for_bt) {
            // Encode the block's bit pattern as its index within class x.
            number_type bin = rrr_helper_type::decode_btnr(bv, pos, t_bs);
            number_type nr = rrr_helper_type::bin_to_nr(bin);
            rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt);
        }
        btnr_pos += space_for_bt;
        pos += t_bs;
    }
    if (pos < m_size) { // handle last not full block
        if ((i % t_k) == (size_type)0) {
            m_btnrp[ i/t_k ] = btnr_pos;
            m_rank[ i/t_k ] = sum_rank;
            m_invert[ i/t_k ] = 0; // default: set last block to not inverted
            invert = false;
        }
        uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]);
        // no extra dummy block added to bt_array, therefore this condition should hold
        assert(i == bt_array.size());
        sum_rank += invert ? (t_bs - x) : x;
        if (space_for_bt) {
            number_type bin = rrr_helper_type::decode_btnr(bv, pos, m_size-pos);
            number_type nr = rrr_helper_type::bin_to_nr(bin);
            rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt);
        }
        btnr_pos += space_for_bt;
        assert(m_rank.size()-1 == ((i+t_k-1)/t_k));
    } else { // handle last empty full block
        assert(m_rank.size()-1 == ((i+t_k-1)/t_k));
    }
    // for technical reasons we add a last element to m_rank
    m_rank[ m_rank.size()-1 ] = sum_rank; // sum_rank contains the total number of set bits in bv
    m_bt = bt_array;
}