Ejemplo n.º 1
0
//! Builds a compressed LCP structure into `idx`, creating any missing cache files first.
/*!
 * \param idx        Receives the constructed structure (via swap with a temporary).
 * \param file       Input file, read only if the text is not yet cached.
 * \param config     Cache configuration; cache files are registered in it.
 * \param num_bytes  Forwarded to load_vector_from_file when the text is parsed.
 *
 * Every stage is wrapped in its own memory_monitor event. If
 * config.delete_files is set, all files in config.file_map are deleted at the end.
 */
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, lcp_tag)
{
    auto total_event = memory_monitor::event("construct compressed LCP");
    using text_type = int_vector<t_width>;
    const char* const text_key = key_text_trait<t_width>::KEY_TEXT;
    {
        // Stage A: make sure an LCP cache file exists.
        auto lcp_event = memory_monitor::event("LCP");
        if (!cache_file_exists(conf::KEY_LCP, config)) {
            {
                // A.1: the text is a prerequisite; parse it unless it is cached.
                auto parse_event = memory_monitor::event("parse input text");
                if (!cache_file_exists(text_key, config)) {
                    text_type input;
                    load_vector_from_file(input, file, num_bytes);
                    if (contains_no_zero_symbol(input, file)) {
                        append_zero_symbol(input);
                        store_to_cache(input, text_key, config);
                    }
                }
                register_cache_file(text_key, config);
            }
            {
                // A.2: the suffix array is a prerequisite; build it unless it is cached.
                auto sa_event = memory_monitor::event("SA");
                if (!cache_file_exists(conf::KEY_SA, config)) {
                    construct_sa<t_width>(config);
                }
                register_cache_file(conf::KEY_SA, config);
            }
            // A.3: width 8 uses the semi-external PHI variant, everything else the plain one.
            if (t_width != 8) {
                construct_lcp_PHI<t_width>(config);
            } else {
                construct_lcp_semi_extern_PHI(config);
            }
        }
        register_cache_file(conf::KEY_LCP, config);
    }
    {
        // Stage B: build the structure from the cache and hand it to the caller.
        auto build_event = memory_monitor::event("compressed LCP");
        t_index built(config);
        built.swap(idx);
    }
    if (config.delete_files) {
        auto cleanup_event = memory_monitor::event("delete temporary files");
        util::delete_all_files(config.file_map);
    }
}
Ejemplo n.º 2
0
//! Builds a compressed suffix array into `idx`, creating any missing cache files first.
/*!
 * \param idx        Receives the constructed CSA (via swap with a temporary).
 * \param file       Input file, read only if the text is not yet cached.
 * \param config     Cache configuration; cache files are registered in it.
 * \param num_bytes  Forwarded to load_vector_from_file when the text is parsed.
 *
 * Stages: text, suffix array, BWT, then the CSA itself. Each stage is wrapped
 * in its own memory_monitor event. If config.delete_files is set, all files in
 * config.file_map are deleted at the end.
 */
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, csa_tag)
{
    auto total_event = memory_monitor::event("construct CSA");
    constexpr auto width = t_index::alphabet_category::WIDTH;
    using text_type = int_vector<width>;
    const char* const text_key = key_text_trait<width>::KEY_TEXT;
    const char* const bwt_key  = key_bwt_trait<width>::KEY_BWT;
    {
        // Stage 1: parse the text unless it is already cached.
        auto parse_event = memory_monitor::event("parse input text");
        if (!cache_file_exists(text_key, config)) {
            text_type input;
            load_vector_from_file(input, file, num_bytes);
            if (contains_no_zero_symbol(input, file)) {
                append_zero_symbol(input);
                store_to_cache(input, text_key, config);
            }
        }
        register_cache_file(text_key, config);
    }
    {
        // Stage 2: build the suffix array unless it is already cached.
        auto sa_event = memory_monitor::event("SA");
        if (!cache_file_exists(conf::KEY_SA, config)) {
            construct_sa<width>(config);
        }
        register_cache_file(conf::KEY_SA, config);
    }
    {
        // Stage 3: build the BWT unless it is already cached.
        auto bwt_event = memory_monitor::event("BWT");
        if (!cache_file_exists(bwt_key, config)) {
            construct_bwt<width>(config);
        }
        register_cache_file(bwt_key, config);
    }
    {
        // Stage 4: build the CSA from the cache and hand it to the caller.
        auto csa_event = memory_monitor::event("construct CSA");
        t_index built(config);
        idx.swap(built);
    }
    if (config.delete_files) {
        auto cleanup_event = memory_monitor::event("delete temporary files");
        util::delete_all_files(config.file_map);
    }
}
Ejemplo n.º 3
0
//! Computes the LCP array via the PHI array and writes it to the LCP cache file.
/*!
 * \param config  Cache configuration; the suffix array file is looked up in
 *                config.file_map and the text is loaded from the cache.
 *
 * The working array `plcp` first holds PHI (for each suffix, the suffix that
 * precedes it in suffix-array order), is then overwritten in text order with
 * the permuted LCP values, and is finally read in suffix-array order to emit
 * the LCP array. The resulting file is registered in `config`.
 */
void construct_lcp_PHI(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    using size_type = int_vector<>::size_type;
    using text_type = int_vector<t_width>;
    const char* const text_key = key_text_trait<t_width>::KEY_TEXT;
    int_vector_buffer<> sa_stream(config.file_map[conf::KEY_SA]);
    const size_type len = sa_stream.size();

    assert(len > 0);
    if (len == 1) {
        // Only the sentinel character is present: the LCP array is a single 0.
        int_vector<> trivial_lcp(1, 0);
        store_to_cache(trivial_lcp, conf::KEY_LCP, config);
        return;
    }

    // Phase 1: fill plcp with PHI, i.e. plcp[SA[r]] = SA[r-1] (and 0 for r == 0).
    int_vector<> plcp(len, 0, sa_stream.width());
    {
        size_type prev_suffix = 0;
        for (size_type rank = 0; rank < len; ++rank) {
            const size_type suffix = sa_stream[rank];
            plcp[suffix] = prev_suffix;
            prev_suffix = suffix;
        }
    }

    // Phase 2: bring the text into memory.
    text_type text;
    load_from_cache(text, text_key, config);

    // Phase 3: replace PHI by the permuted LCP values, scanning in text order.
    // `match` carries over between positions, reduced by one after every non-zero result.
    size_type longest = 0;
    {
        size_type match = 0;
        const size_type stop = len - 1;
        for (size_type pos = 0; pos < stop; ++pos) {
            const size_type phi = plcp[pos];
            while (text[pos + match] == text[phi + match]) {
                ++match;
            }
            plcp[pos] = match;
            if (match) {
                if (longest < match) {
                    longest = match;
                }
                --match;
            }
        }
    }
    util::clear(text);
    const uint8_t lcp_bits = bits::hi(longest) + 1;

    // Phase 4: emit LCP[r] = plcp[SA[r]] in suffix-array order.
    const std::string lcp_path = cache_file_name(conf::KEY_LCP, config);
    const size_type io_block = 1000000; // io_block is a multiple of 8!
    int_vector_buffer<> lcp_stream(lcp_path, std::ios::out, io_block, lcp_bits);
    lcp_stream[0] = 0;
    sa_stream.buffersize(io_block);
    for (size_type rank = 1; rank < len; ++rank) {
        const size_type suffix = sa_stream[rank];
        lcp_stream[rank] = plcp[suffix];
    }
    lcp_stream.close();
    register_cache_file(conf::KEY_LCP, config);
}
Ejemplo n.º 4
0
//! Builds a compressed suffix tree into `idx`, creating any missing cache files first.
/*!
 * \param idx        Receives the constructed CST (via swap with a temporary).
 * \param file       Input file, forwarded to the CSA construction if needed.
 * \param config     Cache configuration; cache files are registered in it.
 * \param num_bytes  Forwarded to the CSA construction if needed.
 *
 * Stages: CSA (built through the csa_tag overload with a separate cache_config
 * whose first constructor argument is `false`), LCP array, then the CST itself.
 * If config.delete_files is set, all files in config.file_map are deleted at the end.
 */
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, cst_tag)
{
    auto total_event = memory_monitor::event("construct CST");
    constexpr auto width = t_index::alphabet_category::WIDTH;
    const char* const text_key = key_text_trait<width>::KEY_TEXT;
    const char* const bwt_key  = key_bwt_trait<width>::KEY_BWT;
    csa_tag csa_dispatch;
    {
        // Stage 1: obtain the CSA cache file, building and storing the CSA if absent.
        typename t_index::csa_type csa;
        // Cache key of the CSA; re-evaluated at every use.
        auto csa_key = [&csa]() {
            return std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa);
        };
        if (!cache_file_exists(csa_key(), config)) {
            cache_config csa_config(false, config.dir, config.id, config.file_map);
            construct(csa, file, csa_config, num_bytes, csa_dispatch);
            auto store_event = memory_monitor::event("store CSA");
            // Adopt the files registered during CSA construction.
            config.file_map = csa_config.file_map;
            store_to_cache(csa, csa_key(), config);
        }
        register_cache_file(csa_key(), config);
    }
    {
        // Stage 2: obtain the LCP cache file, building it if absent.
        auto lcp_event = memory_monitor::event("LCP");
        register_cache_file(text_key, config);
        register_cache_file(bwt_key, config);
        register_cache_file(conf::KEY_SA, config);
        if (!cache_file_exists(conf::KEY_LCP, config)) {
            // Width 8 uses the semi-external PHI variant, everything else the plain one.
            if (width != 8) {
                construct_lcp_PHI<width>(config);
            } else {
                construct_lcp_semi_extern_PHI(config);
            }
        }
        register_cache_file(conf::KEY_LCP, config);
    }
    {
        // Stage 3: build the CST from the cache and hand it to the caller.
        auto cst_event = memory_monitor::event("CST");
        t_index built(config);
        built.swap(idx);
    }
    if (config.delete_files) {
        auto cleanup_event = memory_monitor::event("delete temporary files");
        util::delete_all_files(config.file_map);
    }
}
Ejemplo n.º 5
0
    //! Constructor
    /*!
     * Builds the alphabet from the cached text and the SA member from `config`,
     * calls set_isa_samples with the cached suffix array and m_isa, and then
     * stores m_isa under the ISA cache key.
     *
     * \param config  Cache configuration used to locate the text and SA files;
     *                the ISA file is registered in it on success.
     * \throws std::ios_base::failure if m_isa cannot be written to its cache file.
     */
    csa_bitcompressed(cache_config& config) {
        const std::string text_path = cache_file_name(key_trait<alphabet_type::int_width>::KEY_TEXT,config);
        int_vector_buffer<alphabet_type::int_width> text_stream(text_path);
        int_vector_buffer<>  sa_stream(cache_file_name(conf::KEY_SA,config));
        const size_type text_len = text_stream.size();
        {
            // Build into a temporary and swap it into the member.
            alphabet_type fresh_alphabet(text_stream, text_len);
            m_alphabet.swap(fresh_alphabet);
        }
        {
            // Same build-then-swap approach for the SA member.
            sa_sample_type fresh_sa(config);
            m_sa.swap(fresh_sa);
        }
        set_isa_samples<csa_bitcompressed>(sa_stream, m_isa);

        // Persist the ISA; the cache entry is registered only when the write succeeded.
        if (!store_to_file(m_isa, cache_file_name(conf::KEY_ISA,config), true)) {
            throw std::ios_base::failure("#csa_bitcompressed: Cannot store ISA to file system!");
        }
        register_cache_file(conf::KEY_ISA, config);
    }
Ejemplo n.º 6
0
//! Computes the LCP array via the PHI array and writes it to the LCP cache file.
/*!
 * \param config  Cache configuration; the suffix array file is looked up in
 *                config.file_map and the text is loaded from the cache.
 *
 * The working array `plcp` first holds PHI (for each suffix, the suffix that
 * precedes it in suffix-array order), is then overwritten in text order with
 * the permuted LCP values, and is finally read block-wise in suffix-array
 * order to emit the LCP array. The output file consists of the bit size
 * (size_type), the integer width (uint8_t) and the packed values, zero-padded
 * to a multiple of 8 bytes. The resulting file is registered in `config`.
 *
 * The output file is opened with std::ios::trunc: a header is always written
 * first, so appending to a pre-existing file would leave the stale content in
 * front of the new data.
 */
void construct_lcp_PHI(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

//	(1) Calculate PHI (stored in array plcp): plcp[SA[i]] = SA[i-1], and 0 for i == 0.
//      The suffix array is consumed block by block; r is the size of the
//      current block and r_sum the number of entries in all previous blocks.
    int_vector<> plcp(n, 0, sa_buf.width);
    for (size_type i=0, r_sum=0, r=sa_buf.load_next_block(), sai_1 = 0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            plcp[ sai ] = sai_1;
            sai_1 = sai;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }

//  (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

//  (3) Calculate permuted LCP array (text order), called PLCP.
//      l carries over between positions, reduced by one after every non-zero result.
    size_type max_l = 0;
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            --l;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(max_l)+1;

//	(4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // Truncate instead of append so that an existing file is replaced, not extended.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::trunc | std::ios::out);   // open buffer for lcp

    size_type bit_size = n*lcp_width;
    lcp_out_buf.write(reinterpret_cast<const char*>(&bit_size), sizeof(bit_size));	// write size of vector
    lcp_out_buf.write(reinterpret_cast<const char*>(&lcp_width), sizeof(lcp_width));  // write int_width of vector
    size_type wb = 0;  // bytes written into lcp int_vector

    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!

    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0;  // LCP[0] is 0; the first block below starts filling at index 1
    sa_buf.reset(buffer_size);
    // r starts at 0 so that the first pass of the loop only loads the first block.
    size_type r = 0;// sa_buf.load_next_block();
    for (size_type i=1, r_sum=0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            lcp_buf[ i-r_sum ] = plcp[sai];
        }
        if (r > 0) {
            // Number of bytes occupied by the r values of the current block.
            size_type cur_wb = (r*lcp_buf.width()+7)/8;
            lcp_out_buf.write(reinterpret_cast<const char*>(lcp_buf.data()), cur_wb);
            wb += cur_wb;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }
    // Pad the payload with zero bytes up to a multiple of 8 bytes.
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}