Ejemplo n.º 1
0
	// Initialises the BM25 ranker from cached collection statistics.
	//
	// Reads the value stored under surf::KEY_COLLEN, makes sure the
	// document-length file exists (building it via
	// surf::construct_doc_lengths if the cache has no such file), loads the
	// document lengths, and derives num_docs and avg_doc_len from them.
	// Both derived values are echoed to std::cerr.
	//
	// cconfig: cache configuration used to locate / create the cache files.
	rank_bm25(cache_config& cconfig) {
		// Zero-initialised so the division below never reads an indeterminate
		// value should load_from_cache leave its output argument untouched
		// (presumably what happens when the cache file is missing - confirm).
		uint64_t num_terms = 0;
		load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
		if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
			surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
		}
		load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
		num_docs = doc_lengths.size();
		std::cerr<<"num_docs = "<<num_docs<<std::endl;
		// Floating-point division: with an empty doc_lengths the result is a
		// non-finite value rather than a trap.
		avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);
		std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
	}
Ejemplo n.º 2
0
	// Initialises the TF-IDF ranker from cached collection statistics.
	//
	// Loads num_terms from the surf::KEY_COLLEN cache entry, makes sure the
	// document-length file exists (building it via
	// surf::construct_doc_lengths if the cache has no such file), loads the
	// document lengths, and derives num_docs and min_doc_len from them.
	// Both derived values are echoed to std::cerr.
	//
	// cconfig: cache configuration used to locate / create the cache files.
	rank_tfidf(cache_config& cconfig) {
		load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
		if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
			surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
		}
		load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
		num_docs = doc_lengths.size();
		std::cerr<<"num_docs = "<<num_docs<<std::endl;
		auto min_itr = std::min_element(doc_lengths.begin(), doc_lengths.end());
		// std::min_element returns the end iterator for an empty range;
		// dereferencing it would be undefined behaviour, so fall back to 0.
		if (min_itr != doc_lengths.end()) {
			min_doc_len = *min_itr;
		} else {
			min_doc_len = 0;
		}
		std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
	}
Ejemplo n.º 3
0
// Builds the LCP array from the cached suffix array and text and registers
// the resulting file under conf::KEY_LCP.
//
// Steps, as labelled in the body:
//   (1) stream the suffix array and record, for every text position, the
//       suffix-array entry that precedes it (PHI, kept in `plcp`);
//   (2) load the text;
//   (3) overwrite `plcp` in text order with the match length between each
//       suffix and its PHI suffix;
//   (4) stream the suffix array again and write plcp[SA[i]] as LCP[i].
//
// config: cache configuration holding the SA / text file names; receives the
//         LCP entry.
//
// If the text cannot be loaded the function returns without producing an LCP
// file (same convention as construct_lcp_kasai). `t_width` is not declared in
// this block - presumably a template parameter introduced just above it.
void construct_lcp_PHI(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_buffer<> sa_buf(config.file_map[conf::KEY_SA]);
    size_type n = sa_buf.size();

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, conf::KEY_LCP, config);
        return;
    }

//  (1) Calculate PHI (stored in array plcp)
    int_vector<> plcp(n, 0, sa_buf.width());
    for (size_type i=0, sai_1 = 0; i < n; ++i) {
        size_type sai = sa_buf[i];
        plcp[ sai ] = sai_1;
        sai_1 = sai;
    }

//  (2) Load text from disk
    text_type text;
    if (!load_from_cache(text, KEY_TEXT, config)) {
        // Without the text the comparison loop below would index an empty
        // vector; bail out instead.
        return;
    }

//  (3) Calculate permuted LCP array (text order), called PLCP
    size_type max_l = 0;
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        // No explicit bounds check: the loop relies on the text layout to
        // stop the comparison (presumably a unique terminating sentinel -
        // confirm against the text construction).
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            // The next position starts comparing from l-1 matched symbols.
            --l;
        }
    }
    util::clear(text);
    // Width of the output vector is derived from the largest value seen.
    uint8_t lcp_width = bits::hi(max_l)+1;

//  (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width);   // open buffer for lcp
    lcp_buf[0] = 0;
    sa_buf.buffersize(buffer_size);
    for (size_type i=1; i < n; ++i) {
        size_type sai = sa_buf[i];
        lcp_buf[i] = plcp[sai];
    }
    lcp_buf.close();
    register_cache_file(conf::KEY_LCP, config);
}
Ejemplo n.º 4
0
// Computes the LCP array with Kasai's algorithm and stores it in the cache
// under constants::KEY_LCP.
//
// The function first calls construct_isa(config), then loads the text and the
// suffix array from the cache. The inverse suffix array is read block by block
// through an int_vector_file_buffer. The loaded suffix array is overwritten in
// place with the LCP values and finally swapped into `lcp`.
//
// config: cache configuration providing the ISA / SA / text files and
//         receiving the LCP entry.
//
// If either the text or the suffix array cannot be loaded the function
// returns without storing an LCP array. `t_width` is not declared in this
// block - presumably a template parameter introduced just above it.
void construct_lcp_kasai(cache_config& config)
{
    int_vector<> lcp;
    typedef int_vector<>::size_type size_type;
    construct_isa(config);
    {
        int_vector<t_width> text;
        if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) {
            return;
        }
        int_vector_file_buffer<> isa_buf(config.file_map[constants::KEY_ISA], 1000000);   // init isa file_buffer
        int_vector<> sa;
        if (!load_from_cache(sa, constants::KEY_SA, config)) {
            return;
        }
        // use Kasai algorithm to compute the lcp values
        // Outer loop: one iteration per ISA block. r is the number of entries
        // returned by the last load_next_block() call, r_sum the number of
        // entries consumed by earlier blocks; i is the global text position.
        for (size_type i=0,j=0,sa_1=0,l=0, r_sum=0, r=isa_buf.load_next_block(), n=isa_buf.int_vector_size; r_sum < n;) {
            for (; i < r_sum+r; ++i) {
                sa_1 =  isa_buf[i-r_sum]; // = isa[i]
                if (sa_1) {
                    // j is the suffix stored one slot before suffix i in the SA.
                    j = sa[sa_1-1];
                    // Carry over the previous match length minus one.
                    if (l) --l;
                    assert(i!=j);
                    while (text[i+l]==text[j+l]) { // i+l < n and j+l < n are not necessary, since text[n]=0 and text[i]!=0 (i<n) and i!=j
                        ++l;
                    }
                    // Slot sa_1-1 has already been read above, so it can be reused.
                    sa[ sa_1-1 ] = l; //overwrite sa array with lcp values
                } else {
                    // Suffix i sits in SA slot 0 and has no predecessor.
                    l = 0;
                    sa[ n-1 ] = 0;
                }
            }
            r_sum += r;
            r = isa_buf.load_next_block();
        }

        // Values were written one slot too low (at sa_1-1); shift everything
        // up by one position and set the first entry to 0.
        for (size_type i=sa.size(); i>1; --i) {
            sa[i-1] = sa[i-2];
        }
        sa[0] = 0;
        lcp.swap(sa);
    }
    store_to_cache(lcp, constants::KEY_LCP, config);
}
Ejemplo n.º 5
0
// Kasai-style LCP construction: derives the LCP array from the cached text,
// suffix array and inverse suffix array, then stores it under conf::KEY_LCP.
//
// construct_isa(config) is invoked first. The suffix array is loaded into
// memory and reused as the output buffer; the inverse suffix array is read
// through an int_vector_buffer. When the text or the suffix array cannot be
// loaded the function returns early and nothing is stored.
//
// config: cache configuration providing the input files and receiving the
//         LCP entry.
// `t_width` is not declared in this block - presumably a template parameter
// introduced just above it.
void construct_lcp_kasai(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
    typedef int_vector<>::size_type size_type;
    int_vector<> lcp;
    construct_isa(config);
    {
        int_vector<t_width> text;
        if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) {
            return;
        }
        int_vector_buffer<> isa_buf(config.file_map[conf::KEY_ISA], std::ios::in, 1000000);   // buffered view on the ISA file
        int_vector<> sa;
        if (!load_from_cache(sa, conf::KEY_SA, config)) {
            return;
        }

        const size_type n = isa_buf.size();
        size_type match_len = 0;
        for (size_type pos = 0; pos < n; ++pos) {
            const size_type rank = isa_buf[pos]; // = isa[pos]
            if (0 == rank) {
                // The suffix in SA slot 0 has no predecessor to compare with.
                match_len = 0;
                sa[ n-1 ] = 0;
                continue;
            }
            // Start of the suffix that precedes `pos` in suffix-array order.
            const size_type pred = sa[rank-1];
            if (match_len > 0) {
                --match_len;
            }
            assert(pos != pred);
            // No bounds test here; the original relies on text[n]=0 and
            // text[i]!=0 for i<n to terminate the comparison.
            while (text[pos+match_len] == text[pred+match_len]) {
                ++match_len;
            }
            sa[ rank-1 ] = match_len; // SA slot is no longer needed, reuse it
        }

        // Move every value one slot up and put a 0 in front.
        size_type idx = sa.size();
        while (idx > 1) {
            sa[idx-1] = sa[idx-2];
            --idx;
        }
        sa[0] = 0;
        lcp.swap(sa);
    }
    store_to_cache(lcp, conf::KEY_LCP, config);
}
Ejemplo n.º 6
0
// Returns the requested data, served from the cache when find_record()
// reports a hit and otherwise read from disk and inserted into the cache.
//
// The PROFILE_* macros are defined outside this block; per the original
// comments they open profiling actions that are finished when their guards
// go out of scope - confirm against the macro definitions.
std::string cache_read() {
	PROFILE_FUNC();

	std::string data;

	PROFILE_START(action_find); // Nested action, inner to the enclosing read action
	const bool hit = find_record();
	PROFILE_STOP(action_find);

	if (hit) {
		data = load_from_cache();
		return data;
	} else {
		// Miss: the block-scoped action covers the disk read and the cache insert.
		PROFILE_BLOCK(load_from_disk);

		data = read_from_disk();
		put_into_cache(data);
		return data; // Guards are destructed on the way out, finishing their actions
	}
}
Ejemplo n.º 7
0
// Prints basic statistics of a collection: text length, number of documents,
// number of dictionary terms (one per line of the dictionary file) and the
// average document length (integer division of text length by document count).
//
// argc/argv: forwarded to parse_args, which supplies args.collection_dir.
int main( int argc, char** argv ) {
    /* parse command line */
    cmdargs_t args = parse_args(argc,argv);

    /* parse repo */
    auto cc = surf::parse_collection(args.collection_dir);
    sdsl::int_vector_buffer<> T(args.collection_dir+"/"+surf::TEXT_FILENAME);
    std::cout << "n = |T|= " << T.size() << std::endl;
    surf::construct_doc_cnt<sdsl::int_alphabet_tag::WIDTH>(cc);
    uint64_t doc_cnt = 0;
    load_from_cache(doc_cnt, surf::KEY_DOCCNT, cc);
    std::cout << "number of documents = N = " << doc_cnt << std::endl;

    /* count dictionary entries: one term per line */
    std::ifstream dic_fs(args.collection_dir+"/"+surf::DICT_FILENAME);
    if (!dic_fs) {
        // Otherwise a missing dictionary would silently be reported as 0 terms.
        std::cerr << "warning: could not open dictionary file" << std::endl;
    }
    std::string line;
    size_t num_terms = 0;
    while( std::getline(dic_fs,line) ) {
        num_terms++;
    }
    std::cout << "number of terms = sigma = " << num_terms << std::endl;

    // Integer division by zero is undefined; report 0 for a collection
    // without documents instead.
    std::cout << "avg document length = " << (doc_cnt != 0 ? T.size() / doc_cnt : 0) << std::endl;
    return 0;
}
Ejemplo n.º 8
0
// Builds the suffix array for the cached text and stores it under
// conf::KEY_SA.
//
// Dispatches on the alphabet width:
//   8 - loads the byte text into memory and calls algorithm::calculate_sa
//       (divsufsort, per the original comment);
//   0 - passes the text file name to sdsl::qsufsort::construct_sa.
// Any other width is rejected at compile time by the static_assert; the
// default branch only prints a diagnostic.
//
// config: cache configuration providing the text file and receiving the
//         suffix array.
// `t_width` is not declared in this block - presumably a template parameter
// introduced just above it.
void construct_sa(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_sa: width must be `0` for integer alphabet and `8` for byte alphabet");
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    switch (t_width) {
        case 8: {
            int_vector<t_width> text;
            load_from_cache(text, KEY_TEXT, config);
            // Each SA entry needs enough bits to hold any index into the text.
            const auto text_len = text.size();
            int_vector<> sa(text_len, 0, bits::hi(text_len)+1);
            // call divsufsort
            algorithm::calculate_sa((const unsigned char*)text.data(), text_len, sa);
            store_to_cache(sa, conf::KEY_SA, config);
            break;
        }
        case 0: {
            // call qsufsort
            int_vector<> sa;
            sdsl::qsufsort::construct_sa(sa, config.file_map[KEY_TEXT].c_str(), 0);
            store_to_cache(sa, conf::KEY_SA, config);
            break;
        }
        default:
            std::cerr << "Unknown alphabet type" << std::endl;
            break;
    }
}
Ejemplo n.º 9
0
// Builds the LCP array from the cached suffix array and text, writes it to the
// LCP cache file by hand (size header, width header, packed payload, padding)
// and registers that file under constants::KEY_LCP.
//
// The suffix array is never held in memory as a whole: it is consumed block by
// block through an int_vector_file_buffer, once in step (1) and once more in
// step (4).
//
// config: cache configuration holding the SA / text file names; receives the
//         LCP entry.
// `t_width` is not declared in this block - presumably a template parameter
// introduced just above it.
void construct_lcp_PHI(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;

    assert(n > 0);
    if (1 == n) {  // Handle special case: Input only the sentinel character.
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

//  (1) Calculate PHI (stored in array plcp)
    int_vector<> plcp(n, 0, sa_buf.width);
    // r is the number of entries in the current block, r_sum the number of
    // entries consumed by earlier blocks; sai_1 is the previous SA entry.
    for (size_type i=0, r_sum=0, r=sa_buf.load_next_block(), sai_1 = 0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            plcp[ sai ] = sai_1;
            sai_1 = sai;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }

//  (2) Load text from disk
    text_type text;
    // NOTE(review): the result of load_from_cache is not checked here, although
    // the loop below indexes `text` unconditionally - confirm this cannot fail.
    load_from_cache(text, KEY_TEXT, config);

//  (3) Calculate permuted LCP array (text order), called PLCP
    size_type max_l = 0;
    for (size_type i=0, l=0; i < n-1; ++i) {
        size_type phii = plcp[i];
        // No explicit bounds check; presumably a terminating sentinel in the
        // text stops the comparison - confirm.
        while (text[i+l] == text[phii+l]) {
            ++l;
        }
        plcp[i] = l;
        if (l) {
            max_l = std::max(max_l, l);
            // The next position starts comparing from l-1 matched symbols.
            --l;
        }
    }
    util::clear(text);
    // Width of the output vector is derived from the largest value seen.
    uint8_t lcp_width = bits::hi(max_l)+1;

//  (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // NOTE(review): the stream is opened with std::ios::app; if a file already
    // exists at lcp_file the new header and payload would presumably end up
    // after the old content - confirm callers guarantee a fresh file.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::app | std::ios::out);   // open buffer for lcp

    // Header: total size of the vector in bits, followed by the element width.
    size_type bit_size = n*lcp_width;
    lcp_out_buf.write((char*) &(bit_size), sizeof(bit_size));	// write size of vector
    lcp_out_buf.write((char*) &(lcp_width),sizeof(lcp_width));  // write int_width of vector
    size_type wb = 0;  // bytes written into lcp int_vector

    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!

    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0;
    sa_buf.reset(buffer_size);
    // r starts at 0, so the first pass of the loop below only loads the first
    // SA block; i starts at 1 because entry 0 was already set above.
    size_type r = 0;// sa_buf.load_next_block();
    for (size_type i=1, r_sum=0; r_sum < n;) {
        for (; i < r_sum+r; ++i) {
            size_type sai = sa_buf[i-r_sum];
            lcp_buf[ i-r_sum ] = plcp[sai];
        }
        if (r > 0) {
            // Bytes needed for r packed entries, rounded up.
            size_type cur_wb = (r*lcp_buf.width()+7)/8;
            lcp_out_buf.write((const char*)lcp_buf.data(), cur_wb);
            wb += cur_wb;
        }
        r_sum += r; r = sa_buf.load_next_block();
    }
    // Pad the payload with zero bytes up to a multiple of 8 bytes.
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}