예제 #1
0
        /*!
         *  Builds the alphabet description from the first `len` bytes of `text_buf`:
         *  the number of distinct bytes (m_sigma), a bit vector marking which byte
         *  values occur (m_char, plus rank/select support), and the cumulative
         *  occurrence counts of the occurring bytes (m_C).
         *  If `len` is 0 or the buffer is empty, only m_sigma is set (to 0).
         *
         *  \param text_buf Byte stream.
         *  \param len      Length of the byte stream.
         */
        succinct_byte_alphabet(int_vector_buffer<8>& text_buf, int_vector_size_type len):
            char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
        {
            m_sigma = 0;
            if (0 == len or 0 == text_buf.size())
                return;
            assert(len <= text_buf.size());

            // occ[c] = number of occurrences of byte value c in the prefix
            int_vector<64> occ(257, 0);
            for (size_type pos = 0; pos < len; ++pos) {
                ++occ[text_buf[pos]];
            }
            assert(1 == occ[0]); // null-byte should occur exactly once

            // Mark the occurring byte values and move their counts to the front
            // of occ. This is safe in place because m_sigma never exceeds c.
            bit_vector present(256, 0);
            for (int c = 0; c < 256; ++c) {
                if (0 == occ[c])
                    continue;
                present[c] = 1;
                occ[m_sigma] = occ[c];
                ++m_sigma;
            }

            // sigma+1 entries, since CSAs also need the sum of all elements.
            // Entry r holds the summed counts of the r smallest occurring bytes.
            m_C = C_type(m_sigma+1, 0, bits::hi(len)+1);
            m_C[0] = 0;
            size_type total = 0;
            for (int r = 0; r < static_cast<int>(m_sigma); ++r) {
                total += occ[r];
                m_C[r+1] = total;
            }
            assert(m_C[sigma]==len);

            m_char = present;
            util::init_support(m_char_rank, &m_char);
            util::init_support(m_char_select, &m_char);
        }
예제 #2
0
 /*!
  *  Fills D with one entry per element of sa_buf: D[i] is the value of
  *  doc_border_rank evaluated at sa_buf[i]+1.
  *
  *  \param sa_buf           Buffer that is read element by element.
  *  \param doc_border_rank  Rank structure queried for every element.
  *  \param doc_cnt          Used only to choose the bit width of D
  *                          (bits::hi(doc_cnt+1)+1 bits per entry).
  *  \param D                Output vector; any previous content is replaced.
  */
 void
 construct_D_array(int_vector_buffer<0>& sa_buf,
                   bit_vector::rank_1_type& doc_border_rank,
                   const size_type doc_cnt,
                   int_vector<>& D) {
     const size_type n = sa_buf.size();
     D = int_vector<>(n, 0, bits::hi(doc_cnt+1)+1);
     for (size_type idx = 0; idx < n; ++idx) {
         D[idx] = doc_border_rank(sa_buf[idx]+1);
     }
 }
예제 #3
0
/*!
 *  Writes the balanced-parentheses sequence built from lcp_buf into bp
 *  (1 = opening, 0 = closing parenthesis) and, for every closing parenthesis,
 *  one bit into bp_fc that is set when the corresponding stack pop() reports true.
 *
 *  \param lcp_buf  Input values, processed left to right.
 *  \param bp       Output; resized to 2*n bits.
 *  \param bp_fc    Output; resized to n bits.
 *  \param minimum  If true, stack entries larger than the current value are
 *                  closed before it is pushed; if false, smaller ones are.
 *  \return Number of bits set in bp_fc.
 */
bit_vector::size_type construct_supercartesian_tree_bp_succinct_and_first_child(
int_vector_buffer<t_width>& lcp_buf, bit_vector& bp, bit_vector& bp_fc, const bool minimum = true)
{
	typedef bit_vector::size_type size_type;
	const size_type n = lcp_buf.size();
	bp.resize(2 * n); // two parentheses per input element
	bp_fc.resize(n);
	if (0 == n) {
		return 0; // nothing to do for empty input
	}
	util::set_to_value(bp, 0);
	util::set_to_value(bp_fc, 0);
	sorted_multi_stack_support pending(n);

	size_type first_children = 0; // number of 1-bits written to bp_fc
	size_type bp_pos		 = 0; // next write position in bp
	size_type fc_pos		 = 0; // next write position in bp_fc

	// Closes the element on top of the stack. The closing parenthesis itself
	// needs no write because bp was zero-initialized; only the positions advance.
	auto close_top = [&]() {
		if (pending.pop()) {
			bp_fc[fc_pos] = 1;
			++first_children;
		}
		++bp_pos;
		++fc_pos;
	};

	// The two loops differ only in the comparison; they are kept separate so
	// that the inner loop does not branch on `minimum`.
	if (minimum) {
		for (size_type i = 0; i < n; ++i) {
			const size_type cur = lcp_buf[i];
			while (!pending.empty() and cur < pending.top()) {
				close_top();
			}
			pending.push(cur);
			bp[bp_pos++] = 1; // opening parenthesis
		}
	} else {
		for (size_type i = 0; i < n; ++i) {
			const size_type cur = lcp_buf[i];
			while (!pending.empty() and cur > pending.top()) {
				close_top();
			}
			pending.push(cur);
			bp[bp_pos++] = 1; // opening parenthesis
		}
	}

	// Close everything that is still open.
	while (!pending.empty()) {
		close_top();
	}
	return first_children;
}
예제 #4
0
/*!
 *  Builds the balanced-parentheses sequence for lcp_buf and returns it
 *  (1 = opening, 0 = closing parenthesis; 2*n bits for n input elements).
 *
 *  \param lcp_buf  Input values, processed left to right.
 *  \param minimum  If true, larger pending values are closed before the current
 *                  one is opened; if false, smaller ones are.
 *  \return The parentheses sequence; empty if lcp_buf is empty.
 */
bit_vector construct_supercartesian_tree_bp_succinct(int_vector_buffer<t_width>& lcp_buf,
													 const bool					 minimum = true)
{
	typedef bit_vector::size_type size_type;
	const size_type n = lcp_buf.size();
	bit_vector		bp(2 * n, 0); // closing parentheses are the zero default
	if (0 == n) {
		return bp;
	}

	sorted_multi_stack_support pending(n);
	size_type				   out = 0; // next write position in bp

	if (!minimum) {
		// straightforward variant: every element is pushed immediately
		for (size_type i = 0; i < n; ++i) {
			const size_type cur = lcp_buf[i];
			while (!pending.empty() and cur > pending.top()) {
				pending.pop();
				++out; // closing parenthesis, already zero
			}
			pending.push(cur);
			bp[out++] = 1; // opening parenthesis
		}
		return bp;
	}

	// "Lazy stack" variant: the most recent element is kept in `prev` and only
	// pushed once the next element turns out not to be smaller.
	bp[out++]	   = 1;
	size_type prev = lcp_buf[0];
	for (size_type i = 1; i < n; ++i) {
		const size_type cur = lcp_buf[i];
		if (cur >= prev) {
			pending.push(prev);
		} else {
			++out; // closing parenthesis for prev, which was never pushed
			while (!pending.empty() and cur < pending.top()) {
				pending.pop();
				++out; // closing parenthesis, already zero
			}
		}
		bp[out++] = 1; // opening parenthesis for cur
		prev	  = cur;
	}
	// Remaining closing parentheses are the trailing zeros of bp.
	return bp;
}
예제 #5
0
        /*!
         *  Builds the alphabet description from the first `len` symbols of
         *  `text_buf`: the number of distinct symbols (m_sigma) and their
         *  cumulative occurrence counts (m_C). If the occurring symbols do not
         *  form a continuous alphabet, a bit vector marking the occurring symbols
         *  (m_char, plus rank/select support) is built as well.
         *  If `len` is 0 or the buffer is empty, only m_sigma is set (to 0).
         *
         *  \param text_buf Byte stream.
         *  \param len      Length of the byte stream.
         */
        int_alphabet(int_vector_buffer<0>& text_buf, int_vector_size_type len):
            char2comp(this), comp2char(this), C(m_C), sigma(m_sigma)
        {
            m_sigma = 0;
            if (0 == len or 0 == text_buf.size())
                return;
            assert(len <= text_buf.size());

            // symbol -> number of occurrences, ordered by symbol
            typedef std::map<size_type, size_type> occ_map_type;
            occ_map_type D;
            for (size_type pos=0; pos < len; ++pos) {
                ++D[text_buf[pos]];
            }
            m_sigma = D.size();

            // For a continuous alphabet symbols can be mapped directly, so
            // m_char, m_char_rank and m_char_select stay uninitialized.
            if (!is_continuous_alphabet(D)) {
                // D is non-empty here (len > 0), so rbegin() is valid.
                const size_type largest_symbol = D.rbegin()->first;
                bit_vector present(largest_symbol+1, 0);
                for (occ_map_type::const_iterator it = D.begin(); it != D.end(); ++it) {
                    present[it->first] = 1;
                }
                m_char = present;
                util::init_support(m_char_rank, &m_char);
                util::init_support(m_char_select, &m_char);
            }
            assert(D.find(0) != D.end() and 1 == D[0]); // null-byte should occur exactly once

            // sigma+1 entries, since CSAs also need the sum of all elements
            m_C = C_type(m_sigma+1, 0, bits::hi(len)+1);
            size_type total = 0;
            size_type rank  = 0;
            for (occ_map_type::const_iterator it = D.begin(); it != D.end(); ++it) {
                m_C[rank] = total;   // occurrences of all smaller symbols
                total += it->second;
                ++rank;
            }
            m_C[rank] = total;  // sum of all elements
        }
예제 #6
0
        /*! Builds the run-length compressed structure for a prefix of text_buf.
         *
         *  The prefix is split into runs of equal symbols. One symbol per run is
         *  stored in the wavelet tree m_wt, while m_bl marks the first position of
         *  every run in text order and m_bf marks the run heads in the order
         *  obtained by stably sorting the prefix by symbol.
         *  Nothing besides m_size is initialized if text_buf is empty or size is 0.
         *
         *  \param text_buf A int_vector_buffer to the original text.
         *  \param size     The length of the prefix of the text, for which the wavelet tree should be build.
         */
        wt_int_rlmn(int_vector_buffer<>& text_buf, size_type size):m_size(size), sigma(m_wt.sigma) {
            if (0 == text_buf.size() or 0 == size)
                return;
            int_vector<> condensed_bwt;
            {
                // scope for bl and bf
                bit_vector bl = bit_vector(size, 0);
                std::map<uint64_t, uint64_t> C; // symbol -> number of occurrences
                uint64_t last_c = 0;
                size_type runs = 0;
                // First pass: mark run heads in bl, count runs and symbol occurrences.
                for (size_type i=0; i < size; ++i) {
                    uint64_t c = text_buf[i];
                    if (last_c != c or i==0) {
                        bl[i] = 1;
                        ++runs;
                    }
                    ++C[c];
                    last_c = c;
                }
                // C is non-empty because size > 0.
                const uint64_t max_symbol = C.rbegin()->first;
                // m_C[s] = number of symbols in the prefix that are smaller than s.
                m_C = int_vector<>(max_symbol+1, 0, bits::hi(size)+1);
                {
                    // Walk the ordered map in lock-step with the symbol range.
                    // Indexing the map with operator[] here would insert a node
                    // for every symbol that does not occur.
                    auto it = C.cbegin();
                    size_type prefix_sum = 0;
                    for (size_type sym=0; sym<=max_symbol; ++sym) {
                        m_C[sym] = prefix_sum;
                        if (it != C.cend() and it->first == sym) {
                            prefix_sum += it->second;
                            ++it;
                        }
                    }
                }

                // lf_map[c] = position of the next occurrence of c in symbol-sorted order
                int_vector<> lf_map = m_C;
                bit_vector bf = bit_vector(size+1, 0);
                bf[size] = 1; // initialize last element
                condensed_bwt = int_vector<>(runs, 0, bits::hi(max_symbol)+1);
                runs = 0;
                // Second pass: keep one symbol per run and mark each run head in bf.
                for (size_type i=0; i < size; ++i) {
                    uint64_t c = text_buf[i];
                    if (bl[i]) {
                        bf[lf_map[c]] = 1;
                        condensed_bwt[runs++] = c;
                    }
                    ++lf_map[c];
                }
                {
                    // The wavelet tree is built from a buffer, so the condensed
                    // sequence is written to a temporary file first.
                    // TODO: the file name carries no directory component; make the
                    // location of the temporary file configurable.
                    std::string temp_file = "tmp_wt_int_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
                    store_to_file(condensed_bwt, temp_file);
                    util::clear(condensed_bwt);
                    int_vector_buffer<> temp_bwt_buf(temp_file);
                    m_wt = wt_type(temp_bwt_buf, temp_bwt_buf.size());
                    temp_bwt_buf.close(true); // NOTE(review): presumably also removes the temporary file - confirm
                }
                m_bl = bit_vector_type(bl);
                m_bf = bit_vector_type(bf);
            }

            util::init_support(m_bl_rank, &m_bl);
            util::init_support(m_bf_rank, &m_bf);
            util::init_support(m_bf_select, &m_bf);
            util::init_support(m_bl_select, &m_bl);
            // Cache the bf-rank at every m_C entry.
            m_C_bf_rank = int_vector<>(m_C.size(), 0, bits::hi(size)+1);
            for (size_type i=0; i<m_C.size(); ++i) {
                m_C_bf_rank[i] = m_bf_rank(m_C[i]);
            }
        }