Пример #1
0
 //! Construct the doc_border bitvector by streaming the text file
 void
 construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
     int_vector_buffer<WIDTH> text_buf(text_file);
     doc_border = bit_vector(text_buf.size(), 0);
     for (size_type i = 0; i < text_buf.size(); ++i) {
         if (t_doc_delim == text_buf[i]) {
             doc_border[i] = 1;
         }
     }
 }
Пример #2
0
// Removes `row` from this storage.
//
// A row that exists in the master table is replaced in the diff table by an
// empty (0-bit) bit_vector, which acts as a removal marker until the next
// MIX propagates the removal to other nodes.  A row that is unknown to the
// master table is simply dropped from the diff table.
void bit_index_storage::remove_row(const string& row) {
  const bool in_master = bitvals_.find(row) != bitvals_.end();
  if (in_master) {
    // Leave a 0-bit marker so the removal is propagated at the next MIX.
    bitvals_diff_[row] = bit_vector();
    return;
  }
  // Nothing to propagate: the row only ever lived in the diff table.
  bitvals_diff_.erase(row);
}
Пример #3
0
/// Constructs a packed vector of `count` elements, each `bpe` bits wide.
///
/// @param count Number of elements.
/// @param bpe   Bits per element; must satisfy
///              0 <= bpe < std::numeric_limits<value_type>::digits.
/// @throws std::domain_error if `bpe` is negative or is not smaller than the
///         number of value bits in `value_type`.
int_vector::int_vector(const size_type count, const int bpe)
    : num_elems{count}, bits_per_element{bpe} {
  assert(count >= 0);
  assert(bpe >= 0);

  // The asserts above vanish under NDEBUG; without this check a negative
  // width would flow into the size computation below.
  if (bpe < 0) {
    throw std::domain_error("int_vector: Negative bits per element");
  }

  constexpr auto bits_per_block = std::numeric_limits<value_type>::digits;
  if (bits_per_element >= bits_per_block) {
    throw std::domain_error("int_vector: Too many bits per element");
  }

  // Backing storage: one contiguous run of bits for all elements.
  bit_seq = bit_vector(num_elems * bits_per_element);
}
//! Stream the values of lcp_buf through a sorted multi stack and record, per popped entry, the flag returned by pop().
/*!
 *  \param lcp_buf  Block-wise buffered input; it is reset and then read from the start.
 *  \param bp       Output; replaced by a bit_vector of lcp_buf.int_vector_size bits.
 *                  Bit k is set iff the k-th pop() returned true.
 *  \param minimum  If true, entries larger than the incoming value are popped;
 *                  otherwise entries smaller than the incoming value are popped.
 *  \return The number of bits set in bp.
 */
bit_vector::size_type construct_first_p_index(int_vector_file_buffer<fixedIntWidth>& lcp_buf, bit_vector& bp, const bool minimum=true)
{
    typedef bit_vector::size_type size_type;
    lcp_buf.reset();
    const size_type n = lcp_buf.int_vector_size;

    bp = bit_vector(n, 0);
    sorted_multi_stack_support vec_stack(n);

    size_type set_bits = 0;   // number of ones written to bp
    size_type out_pos  = 0;   // next position of bp to decide

    // Walk over the input block by block; `idx` is the global position,
    // `block_begin` the global position of the first value of the loaded block.
    size_type idx         = 0;
    size_type block_begin = 0;
    size_type block_len   = lcp_buf.load_next_block();
    while (block_begin < n) {
        const size_type block_end = block_begin + block_len;
        while (idx < block_end) {
            const size_type x = lcp_buf[idx - block_begin];
            // Pop everything that violates the stack order with respect to x.
            while (!vec_stack.empty() &&
                   (minimum ? (x < vec_stack.top()) : (x > vec_stack.top()))) {
                if (vec_stack.pop()) {
                    bp[out_pos] = 1;
                    ++set_bits;
                }
                ++out_pos;
            }
            vec_stack.push(x);
            ++idx;
        }
        block_begin = block_end;
        block_len   = lcp_buf.load_next_block();
    }

    // Drain whatever is still on the stack.
    while (!vec_stack.empty()) {
        if (vec_stack.pop()) {
            bp[out_pos] = 1;
            ++set_bits;
        }
        ++out_pos;
    }
    return set_bits;
}
	//! (Re)build m_sct_bp and its support structure from the container m_v points to.
	/*! If m_v is NULL, both members are reset to empty objects instead.
	 */
	void construct(){
		if( m_v != NULL ){
#ifdef RMQ_SCT_BUILD_BP_NOT_SUCCINCT
			// variant needing \f$n\log n\f$ bits of extra space in the worst case
			algorithm::construct_supercartesian_tree_bp(*m_v, m_sct_bp);
#else
			// variant needing only \f$n\f$ bits of extra space in all cases
			algorithm::construct_supercartesian_tree_bp_succinct(*m_v, m_sct_bp);
#endif
			m_sct_bp_support = Bp_support(&m_sct_bp);
			return;
		}
		// No input container: fall back to empty structures.
		m_sct_bp = bit_vector(0);
		m_sct_bp_support = Bp_support();
	}
Пример #6
0
        rmq_succinct_sct(const RandomAccessContainer* v=NULL) : sct_bp(m_sct_bp), sct_bp_support(m_sct_bp_support) {
            if (v == NULL) {
				util::assign(m_sct_bp, bit_vector()); util::assign(m_sct_bp_support, Bp_support());
            } else {
#ifdef RMQ_SCT_BUILD_BP_NOT_SUCCINCT
                // this method takes \f$n\log n\f$ bits extra space in the worst case
                algorithm::construct_supercartesian_tree_bp(*v, m_sct_bp, Minimum);
#else
                // this method takes only \f$n\f$ bits extra space in all cases
                algorithm::construct_supercartesian_tree_bp_succinct(*v, m_sct_bp, Minimum);
                //  TODO: constructor which uses int_vector_file_buffer
#endif
                util::assign(m_sct_bp_support, Bp_support(&m_sct_bp));
            }
        }
Пример #7
0
bool bit_index_storage::put_diff(
    const bit_table_t& mixed_diff) {
  for (bit_table_t::const_iterator it = mixed_diff.begin();
      it != mixed_diff.end(); ++it) {
    if (it->second.bit_num() == 0) {
      // 0-bit bit_vector was propagated from other nodes.  This indicates
      // that the row should be removed globally from the master table.
      if (unlearner_) {
        unlearner_->remove(it->first);
      }
      bitvals_.erase(it->first);
    } else {
      if (unlearner_) {
        if (unlearner_->can_touch(it->first)) {
          unlearner_->touch(it->first);
        } else {
          continue;  // drop untouchable value
        }
      }
      bitvals_[it->first] = it->second;
    }
  }

  // New empty rows were created by unlearner and remove_row
  // between get_diff and put_diff
  std::vector<std::string> removed_ids;
  for (bit_table_t::const_iterator it = bitvals_diff_.begin();
      it != bitvals_diff_.end(); ++it) {
    if (it->second.bit_num() == 0) {
      bit_table_t::const_iterator pos;
      pos = mixed_diff.find(it->first);
      if (pos == mixed_diff.end() || pos->second.bit_num() != 0) {
        removed_ids.push_back(it->first);
      }
    }
  }

  bitvals_diff_.clear();

  // Keep empty rows in the diff area until next MIX to
  // propagate the removal of this data to other nodes.
  for (size_t i = 0; i < removed_ids.size(); ++i) {
    bitvals_diff_[removed_ids[i]] = bit_vector();
  }

  return true;
}
Пример #8
0
void bit_index_storage::get_row(const string& row, bit_vector& bv) const {
  {
    bit_table_t::const_iterator it = bitvals_diff_.find(row);
    if (it != bitvals_diff_.end()) {
      bv = it->second;
      return;
    }
  }
  {
    bit_table_t::const_iterator it = bitvals_.find(row);
    if (it != bitvals_.end()) {
      bv = it->second;
      return;
    }
  }
  bv = bit_vector();
}
//! Run a monotone stack of indices over vec and record, for every popped index, whether its value differs from the value below it.
/*!
 *  \param vec      Input container; only size() and operator[] are used.
 *  \param bp       Output; replaced by a bit_vector of vec.size() bits. Bit k is set
 *                  iff, after the k-th pop, the stack is empty or the popped
 *                  value differs from the value referenced by the new top.
 *  \param minimum  If true, indices with a value larger than the incoming value
 *                  are popped; otherwise indices with a smaller value are popped.
 *  \return The number of bits set in bp.
 */
typename RandomAccessContainer::size_type construct_first_p_index(const RandomAccessContainer& vec, bit_vector& bp, const bool minimum=true)
{
    typedef typename RandomAccessContainer::size_type size_type;
    bp = bit_vector(vec.size(), 0);
    sorted_stack_support vec_stack(vec.size());

    size_type set_bits = 0;  // number of ones written to bp
    size_type k = 0;         // next position of bp to decide

    for (size_type i = 0; i < vec.size(); ++i) {
        // Pop all indices whose value violates the stack order w.r.t. vec[i].
        while (vec_stack.size() > 0 &&
               (minimum ? (vec[i] < vec[vec_stack.top()])
                        : (vec[i] > vec[vec_stack.top()]))) {
            const size_type popped_val = vec[vec_stack.top()];
            vec_stack.pop();
            const bool is_first = vec_stack.size() == 0 ||
                                  popped_val != vec[vec_stack.top()];
            if (is_first) {
                bp[k] = 1;
                ++set_bits;
            }
            ++k;
        }
        vec_stack.push(i);
    }

    // Drain the indices that are still on the stack.
    while (vec_stack.size() > 0) {
        const size_type popped_val = vec[vec_stack.top()];
        vec_stack.pop();
        const bool is_first = vec_stack.size() == 0 ||
                              popped_val != vec[vec_stack.top()];
        if (is_first) {
            bp[k] = 1;
            ++set_bits;
        }
        ++k;
    }
    // Every index is pushed once and popped once.
    assert(k == vec.size());
    return set_bits;
}
        /*! \param v The supported bit_vector.
         */
        nearest_neighbour_dictionary(const bit_vector& v):m_ones(0), m_size(0) {
            if (sample_dens==0) { // first logical error check
                throw std::logic_error(util::demangle(typeid(this).name())+": sample_dens should not be equal 0!");
            }
            size_type max_distance_between_two_ones = 0;
            size_type ones = 0; // counter for the ones in v

            // get maximal distance between to ones in the bit vector
            // speed this up by broadword computing
            for (size_type i=0, last_one_pos_plus_1=0; i < v.size(); ++i) {
                if (v[i]) {
                    if (i+1-last_one_pos_plus_1 > max_distance_between_two_ones)
                        max_distance_between_two_ones = i+1-last_one_pos_plus_1;
                    last_one_pos_plus_1 = i+1;
                    ++ones;

                }
            }
            m_ones = ones;
            m_size = v.size();
//			std::cerr<<ones<<std::endl;
            // initialize absolute samples m_abs_samples[0]=0
            m_abs_samples = int_vector<>(m_ones/sample_dens + 1, 0,  bits::hi(v.size())+1);
            // initialize different values
            m_differences = int_vector<>(m_ones - m_ones/sample_dens, 0, bits::hi(max_distance_between_two_ones)+1);
            // initialize m_contains_abs_sample
            m_contains_abs_sample = bit_vector((v.size()+sample_dens-1)/sample_dens, 0);
            ones = 0;
            for (size_type i=0, last_one_pos=0; i < v.size(); ++i) {
                if (v[i]) {
                    ++ones;
                    if ((ones % sample_dens) == 0) {  // insert absolute samples
                        m_abs_samples[ones/sample_dens] = i;
                        m_contains_abs_sample[i/sample_dens] = 1;
                    } else {
                        m_differences[ones - ones/sample_dens - 1] = i - last_one_pos;
                    }
                    last_one_pos = i;
                }
            }
            util::init_support(m_rank_contains_abs_sample, &m_contains_abs_sample);
        }
Пример #11
0
void bit_index_storage::get_row(const string& row, bit_vector& bv) const {
  {
    // First find the row in the diff table.
    bit_table_t::const_iterator it = bitvals_diff_.find(row);
    if (it != bitvals_diff_.end() && it->second.bit_num() != 0) {
      // Row found, and is not 0-bit.  0-bit rows in the diff table means
      // that the row has been removed but not MIXed yet.
      bv = it->second;
      return;
    }
  }
  {
    // Next we find the row in the master table.
    bit_table_t::const_iterator it = bitvals_.find(row);
    if (it != bitvals_.end()) {
      bv = it->second;
      return;
    }
  }
  bv = bit_vector();
}
Пример #12
0
//! Prepare a builder for a vector of n bits that will hold m set bits.
/*!
 *  \param n Size of the vector.
 *  \param m Capacity, i.e. the number of items the builder has room for.
 *  \throws std::runtime_error if m > n.
 *
 *  With m == 0 only the scalar members are initialized; m_low and m_high
 *  are left untouched.
 */
sd_vector_builder::sd_vector_builder(size_type n, size_type m) :
    m_size(n), m_capacity(m),
    m_wl(0),
    m_tail(0), m_items(0),
    m_last_high(0), m_highpos(0)
{
    if(m_capacity > m_size)
    {
        throw std::runtime_error("sd_vector_builder: requested capacity is larger than vector size.");
    }
    if(m_capacity == 0) { return; }

    const size_type size_bits = bits::hi(m_size) + 1;
    const size_type cap_bits  = bits::hi(m_capacity) + 1;
    // Shrink the high part by one bit when both widths coincide, so that the
    // width of the low part (size_bits - high_bits) is greater than 0.
    const size_type high_bits = (cap_bits == size_bits) ? cap_bits - 1 : cap_bits;

    m_wl   = size_bits - high_bits;
    m_low  = int_vector<>(m_capacity, 0, m_wl);
    m_high = bit_vector(m_capacity + (1ULL << high_bits), 0);
}
Пример #13
0
        // Encodes every value of `ints` as (value + 1), split into two streams:
        // the position l of the most significant bit goes to the high-bits
        // builder via append1(l), the l bits below it go to the low-bits builder.
        gamma_vector(Range const& ints)
        {
            typedef typename boost::range_const_iterator<Range>::type iterator_t;

            darray64::builder high_bits;
            bit_vector_builder low_bits;

            high_bits.append1();

            const iterator_t last = boost::end(ints);
            iterator_t cur = boost::begin(ints);
            while (cur != last) {
                const value_type val = *cur + 1;

                uint8_t l = broadword::msb(val);
                const uint64_t top_bit = uint64_t(1) << l;

                // Strip the most significant bit; the remaining l bits are stored.
                low_bits.append_bits(val ^ top_bit, l);
                high_bits.append1(l);

                ++cur;
            }

            // Freeze both builders and move the results into the members.
            darray64(&high_bits).swap(m_high_bits);
            bit_vector(&low_bits).swap(m_low_bits);
        }
Пример #14
0
// Mixes the diffs of two storages and applies the result to a third one.
TEST(bit_index_storage, mix) {
    // First source storage and its diff.
    bit_index_storage lhs;
    lhs.set_row("r1", make_vector("0101"));
    lhs.set_row("r2", make_vector("1010"));
    string lhs_diff;
    lhs.get_diff(lhs_diff);

    // Second source storage and its diff.
    bit_index_storage rhs;
    rhs.set_row("r1", make_vector("1110"));
    rhs.set_row("r3", make_vector("1100"));
    string mixed;
    rhs.get_diff(mixed);

    lhs.mix(lhs_diff, mixed);

    // `mixed` now holds
    //   r1: 0101 (from lhs)
    //   r2: 1010 (from lhs)
    //   r3: 1100 (from rhs)

    // Target storage with different content in every row.
    bit_index_storage target;
    target.set_row("r1", make_vector("1111"));
    target.set_row("r2", make_vector("1111"));
    target.set_row("r3", make_vector("1111"));
    target.set_row("r4", make_vector("1111"));
    target.set_mixed_and_clear_diff(mixed);

    // r1, r2 and r3 are overwritten by the mixed diff.
    const char* const expected[][2] = {
        {"r1", "0101"},
        {"r2", "1010"},
        {"r3", "1100"},
    };
    bit_vector v;
    for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
        target.get_row(expected[i][0], v);
        EXPECT_TRUE(v == make_vector(expected[i][1]));
    }

    // r4 is no longer retained.
    target.get_row("r4", v);
    EXPECT_TRUE(v == bit_vector());
}
Пример #15
0
// Builds the structure from the LCP values cached under conf::KEY_LCP.
//
// Every value is cut into blocks of t_b bits. The lowest block of each value
// is stored on level 0; while the remaining (shifted) value is non-zero, the
// next block is stored on the next level and the overflow bit of the previous
// block is set. m_level_pointer_and_rank interleaves two sequences: even
// index 2*l holds data for level l (first a block counter, later the start
// offset of the level in m_data), odd index 2*l+1 holds the rank of
// m_overflow at that start offset.
//
// \param config Cache configuration used to resolve the LCP file name.
lcp_dac<t_b, t_rank>::lcp_dac(cache_config& config)
{
//  (1) Count for each level, how many blocks are needed for the representation
//      Running time: \f$ O(n \times \frac{\log n}{b}) \f$
//      Result is stored in the even entries of m_level_pointer_and_rank
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    int_vector_buffer<> lcp_buf(lcp_file);
    size_type n = lcp_buf.size(), val=0;
    if (n == 0)
        return; // empty input: leave all members as they are
// initialize counter: two entries (counter/pointer and rank) per level
    auto _size =  std::max(4*bits::hi(2), 2*(((bits::hi(n)+1)+t_b-1) / t_b));
    m_level_pointer_and_rank.resize(_size);
    for (size_type i=0; i < m_level_pointer_and_rank.size(); ++i)
        m_level_pointer_and_rank[i] = 0;
    m_level_pointer_and_rank[0] = n; // level 0 has n entries

    // level_x_2 is twice the level number, i.e. the even index of that level
    uint8_t level_x_2 = 0;
    for (size_type i=0; i < n; ++i) {
        val=lcp_buf[i];
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // increase counter for current level by 1
            ++m_level_pointer_and_rank[level_x_2];
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }

//  (2)    Determine maximum level and prefix sums of level counters
//         Afterwards every even entry holds the start offset of its level.
    m_max_level = 0;
    size_type sum_blocks = 0, last_block_size=0;
    for (size_type i=0, t=0; i < m_level_pointer_and_rank.size(); i+=2) {
        t = sum_blocks;
        sum_blocks += m_level_pointer_and_rank[i];
        m_level_pointer_and_rank[i] = t;
        if (sum_blocks > t) { // level is not empty
            ++m_max_level;
            last_block_size = sum_blocks - t;
        }
    }
    // no overflow bits are allocated for the blocks of the last non-empty level
    m_overflow = bit_vector(sum_blocks - last_block_size, 0);
    m_data.resize(sum_blocks);

    assert(last_block_size > 0);

//  (3)    Enter block and overflow data
//         cnt is a copy of the level start offsets, used as per-level write cursors
    int_vector<64> cnt = m_level_pointer_and_rank;
    const uint64_t mask = bits::lo_set[t_b];

    for (size_type i=0, j=0; i < n; ++i) {
        val=lcp_buf[i];
        j = cnt[0]++;
        m_data[ j ] =  val & mask;
        val >>= t_b; // shift value b bits to the right
        level_x_2 = 2;
        while (val) {
            // the block written last continues on the next level
            m_overflow[j] = 1;
            // increase counter for current level by 1
            j = cnt[level_x_2]++;
            m_data[ j ] = val & mask;
            val >>= t_b; // shift value b bits to the right
            level_x_2 += 2; // increase level by 1
        }
    }

//  (4) Initialize rank data structure for m_overflow and precalc rank for
//      pointers; stored in the odd entries, only for level start offsets
//      that lie inside m_overflow
    util::init_support(m_overflow_rank, &m_overflow);
    for (size_type i=0; 2*i < m_level_pointer_and_rank.size() and
         m_level_pointer_and_rank[2*i] < m_overflow.size(); ++i) {
        m_level_pointer_and_rank[2*i+1] = m_overflow_rank(
                                              m_level_pointer_and_rank[2*i]);
    }
}
Пример #16
0
        //! Construct the run-length compressed structure for a prefix of the text.
        /*! \param text_buf An int_vector_buffer to the original text.
         *  \param size     The length of the prefix of the text for which the wavelet tree should be built.
         *
         *  Nothing is built if the buffer is empty or size is 0. A temporary
         *  file is written to the current directory and removed again.
         */
        wt_int_rlmn(int_vector_buffer<>& text_buf, size_type size):m_size(size), sigma(m_wt.sigma) {
            if (text_buf.size() == 0 || size == 0) {
                return;
            }
            int_vector<> run_heads; // one symbol per run of equal symbols
            {
                // Scope limits the lifetime of the uncompressed bit vectors.
                bit_vector run_start(size, 0);      // 1 at the first position of every run
                std::map<uint64_t, uint64_t> occ;   // symbol -> number of occurrences
                uint64_t prev_sym = 0;
                size_type run_cnt = 0;
                for (size_type pos = 0; pos < size; ++pos) {
                    const uint64_t sym = text_buf[pos];
                    const bool opens_run = (prev_sym != sym) || (pos == 0);
                    if (opens_run) {
                        run_start[pos] = 1;
                        ++run_cnt;
                    }
                    ++occ[sym];
                    prev_sym = sym;
                }

                // m_C[c] = number of symbols in the prefix that are smaller than c
                const uint64_t max_symbol = occ.rbegin()->first;
                m_C = int_vector<>(max_symbol+1, 0, bits::hi(size)+1);
                size_type smaller = 0;
                for (size_type sym = 0; sym <= max_symbol; ++sym) {
                    m_C[sym] = smaller;
                    smaller += occ[sym];
                }

                // next_slot[c] is the next free position of symbol c in the
                // symbol-sorted order; sorted_run_start marks where runs begin
                // in that order.
                int_vector<> next_slot = m_C;
                bit_vector sorted_run_start(size+1, 0);
                sorted_run_start[size] = 1; // initialize last element
                run_heads = int_vector<>(run_cnt, 0, bits::hi(max_symbol)+1);
                run_cnt = 0;
                for (size_type pos = 0; pos < size; ++pos) {
                    const uint64_t sym = text_buf[pos];
                    if (run_start[pos]) {
                        sorted_run_start[next_slot[sym]] = 1;
                        run_heads[run_cnt] = sym;
                        ++run_cnt;
                    }
                    ++next_slot[sym];
                }
                {
                    // The wavelet tree is built from a buffer, so the run
                    // heads take a detour through a temporary file.
                    // TODO: remove absolute file name
                    std::string temp_file = "tmp_wt_int_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
                    store_to_file(run_heads, temp_file);
                    util::clear(run_heads);
                    int_vector_buffer<> temp_bwt_buf(temp_file);
                    m_wt = wt_type(temp_bwt_buf, temp_bwt_buf.size());
                    temp_bwt_buf.close(true);
                }
                m_bl = bit_vector_type(run_start);
                m_bf = bit_vector_type(sorted_run_start);
            }

            util::init_support(m_bl_rank, &m_bl);
            util::init_support(m_bf_rank, &m_bf);
            util::init_support(m_bf_select, &m_bf);
            util::init_support(m_bl_select, &m_bl);

            // Precompute the rank in m_bf at every symbol boundary.
            m_C_bf_rank = int_vector<>(m_C.size(), 0, bits::hi(size)+1);
            for (size_type sym = 0; sym < m_C.size(); ++sym) {
                m_C_bf_rank[sym] = m_bf_rank(m_C[sym]);
            }
        }
Пример #17
0
        //! Construct the compressed representation of an uncompressed bit vector.
        /*!
        *  \param bv  Uncompressed bitvector.
        *
        *  The vector is cut into blocks of t_bs bits. For every block its block
        *  type (the value returned by rrr_helper_type::get_bt) is recorded, and
        *  a block type number is written to m_btnr. A rank sample and a pointer
        *  into m_btnr are stored for every t_k-th block; t_k consecutive blocks
        *  form a superblock.
        */
        rrr_vector(const bit_vector& bv) {
            m_size = bv.size();
            int_vector<> bt_array;
            bt_array.width(bits::hi(t_bs)+1);
            bt_array.resize((m_size+t_bs)/((size_type)t_bs)); // blocks for the bt_array + a dummy block at the end,
            // if m_size%t_bs == 0

            // (1) calculate the block types and store them in bt_array;
            //     also accumulate the space needed for the block type numbers
            size_type pos = 0, i = 0, x;
            size_type btnr_pos = 0;
            size_type sum_rank = 0;
            while (pos + t_bs <= m_size) { // handle all full blocks
                bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, t_bs);
                sum_rank += x;
                btnr_pos += rrr_helper_type::space_for_bt(x);
                pos += t_bs;
            }
            if (pos < m_size) { // handle last not full block
                bt_array[ i++ ] = x = rrr_helper_type::get_bt(bv, pos, m_size - pos);
                sum_rank += x;
                btnr_pos += rrr_helper_type::space_for_bt(x);
            }
            // allocate the containers with the sizes determined above
            m_btnr  = bit_vector(std::max(btnr_pos, (size_type)64), 0);      // max necessary for case: t_bs == 1
            m_btnrp = int_vector<>((bt_array.size()+t_k-1)/t_k, 0,  bits::hi(btnr_pos)+1);
            m_rank  = int_vector<>((bt_array.size()+t_k-1)/t_k + ((m_size % (t_k*t_bs))>0), 0, bits::hi(sum_rank)+1);
            //                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
            //   only add a finishing block, if the last block of the superblock is not a dummy block
            m_invert = bit_vector((bt_array.size()+t_k-1)/t_k, 0);

            // (2) calculate block type numbers and pointers into btnr and rank samples
            pos = 0; i = 0;
            btnr_pos= 0, sum_rank = 0;
            bool invert = false; // true while the current superblock is stored inverted
            while (pos + t_bs <= m_size) {  // handle all full blocks
                if ((i % t_k) == (size_type)0) { // first block of a superblock
                    m_btnrp[ i/t_k ] = btnr_pos;
                    m_rank[ i/t_k ] = sum_rank;
                    // calculate invert bit for that superblock
                    // (only superblocks that lie completely inside bt_array are considered)
                    if (i+t_k <= bt_array.size()) {
                        size_type gt_half_t_bs = 0; // counter for blocks greater than half of the blocksize
                        for (size_type j=i; j < i+t_k; ++j) {
                            if (bt_array[j] > t_bs/2)
                                ++gt_half_t_bs;
                        }
                        if (gt_half_t_bs > (t_k/2)) {
                            // more than half of the blocks exceed t_bs/2:
                            // replace every block type bt by t_bs - bt
                            m_invert[ i/t_k ] = 1;
                            for (size_type j=i; j < i+t_k; ++j) {
                                bt_array[j] = t_bs - bt_array[j];
                            }
                            invert = true;
                        } else {
                            invert = false;
                        }
                    } else {
                        invert = false;
                    }
                }
                uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]);
                // bt_array holds the inverted type for inverted superblocks;
                // undo the inversion so that sum_rank keeps counting the original block type
                sum_rank += (invert ? (t_bs - x) : x);
                if (space_for_bt) {
                    number_type bin = rrr_helper_type::decode_btnr(bv, pos, t_bs);
                    number_type nr = rrr_helper_type::bin_to_nr(bin);
                    rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt);
                }
                btnr_pos += space_for_bt;
                pos += t_bs;
            }
            if (pos < m_size) { // handle last not full block
                if ((i % t_k) == (size_type)0) {
                    m_btnrp[ i/t_k ] = btnr_pos;
                    m_rank[ i/t_k ] = sum_rank;
                    m_invert[ i/t_k ] = 0; // default: set last block to not inverted
                    invert = false;
                }
                uint16_t space_for_bt = rrr_helper_type::space_for_bt(x=bt_array[i++]);
//          no extra dummy block added to bt_array, therefore this condition should hold
                assert(i == bt_array.size());
                sum_rank += invert ? (t_bs - x) : x;
                if (space_for_bt) {
                    number_type bin = rrr_helper_type::decode_btnr(bv, pos, m_size-pos);
                    number_type nr = rrr_helper_type::bin_to_nr(bin);
                    rrr_helper_type::set_bt(m_btnr, btnr_pos, nr, space_for_bt);
                }
                btnr_pos += space_for_bt;
                assert(m_rank.size()-1 == ((i+t_k-1)/t_k));
            } else { // handle last empty full block
                assert(m_rank.size()-1 == ((i+t_k-1)/t_k));
            }
            // for technical reasons we add a last element to m_rank
            m_rank[ m_rank.size()-1 ] = sum_rank; // sum_rank contains the total number of set bits in bv
            // publish the (possibly inverted) block types
            m_bt = bt_array;
        }