/**
 * Initialises the BM25 ranker from the cached collection statistics.
 *
 * Reads the collection length (surf::KEY_COLLEN) and the per-document
 * lengths (surf::KEY_DOC_LENGTHS, constructed first if no cache file
 * exists), then sets the members `doc_lengths`, `num_docs` and
 * `avg_doc_len`. The two derived values are echoed to std::cerr.
 *
 * @param cconfig Cache configuration used to locate/create the cached files.
 */
rank_bm25(cache_config& cconfig)
{
    // The result of load_from_cache is not checked here, so start from a
    // defined value instead of reading an indeterminate one if the load
    // leaves its argument untouched.
    uint64_t num_terms = 0;
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    // Build the document length table on demand.
    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    // Floating point division: an empty collection yields inf/NaN, not a trap.
    avg_doc_len = static_cast<double>(num_terms) / static_cast<double>(num_docs);
    std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
}
/**
 * Initialises the TF-IDF ranker from the cached collection statistics.
 *
 * Loads the member `num_terms` (surf::KEY_COLLEN) and the per-document
 * lengths (surf::KEY_DOC_LENGTHS, constructed first if no cache file
 * exists), then sets `num_docs` and `min_doc_len`. Both derived values are
 * echoed to std::cerr.
 *
 * @param cconfig Cache configuration used to locate/create the cached files.
 */
rank_tfidf(cache_config& cconfig)
{
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);

    // Build the document length table on demand.
    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr<<"num_docs = "<<num_docs<<std::endl;

    // std::min_element returns end() for an empty range; dereferencing it is
    // undefined, so fall back to 0 when there are no documents.
    auto min_itr = std::min_element(doc_lengths.begin(), doc_lengths.end());
    if (min_itr == doc_lengths.end()) {
        min_doc_len = 0;
    } else {
        min_doc_len = *min_itr;
    }
    std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
}
/**
 * Builds the LCP array with the PHI approach and registers it in the cache.
 *
 * Steps, as performed below:
 *   1. stream the suffix array from config.file_map[conf::KEY_SA] and fill
 *      `plcp` so that plcp[SA[i]] = SA[i-1] (0 for i == 0);
 *   2. load the text from the cache;
 *   3. overwrite `plcp` in text order with the length of the common prefix
 *      of each suffix and the suffix recorded for it in step 1;
 *   4. write the values in suffix array order to the conf::KEY_LCP cache file.
 *
 * @param config Cache configuration; must already contain the suffix array
 *               and the text.
 */
void construct_lcp_PHI(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
    using size_type = int_vector<>::size_type;
    using text_type = int_vector<t_width>;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;

    int_vector_buffer<> sa_buf(config.file_map[conf::KEY_SA]);
    const size_type n = sa_buf.size();
    assert(n > 0);

    // Special case: the input consists of the sentinel character only.
    if (n == 1) {
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, conf::KEY_LCP, config);
        return;
    }

    // (1) Calculate PHI (stored in array plcp)
    int_vector<> plcp(n, 0, sa_buf.width());
    {
        size_type previous_suffix = 0;
        for (size_type rank = 0; rank < n; ++rank) {
            const size_type current_suffix = sa_buf[rank];
            plcp[current_suffix] = previous_suffix;
            previous_suffix = current_suffix;
        }
    }

    // (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // (3) Calculate permuted LCP array (text order), called PLCP
    size_type max_l = 0;
    size_type matched = 0; // carried over between positions, reduced by one each step
    for (size_type pos = 0; pos + 1 < n; ++pos) {
        const size_type partner = plcp[pos];
        // No explicit bounds check: presumably the terminating sentinel
        // stops the comparison - confirm against the text format.
        while (text[pos + matched] == text[partner + matched]) {
            ++matched;
        }
        plcp[pos] = matched;
        if (matched != 0) {
            if (max_l < matched) {
                max_l = matched;
            }
            --matched;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(max_l)+1;

    // (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
    lcp_buf[0] = 0;
    sa_buf.buffersize(buffer_size);
    for (size_type rank = 1; rank < n; ++rank) {
        const size_type suffix = sa_buf[rank];
        lcp_buf[rank] = plcp[suffix];
    }
    lcp_buf.close();
    register_cache_file(conf::KEY_LCP, config);
}
/**
 * Builds the LCP array with the Kasai algorithm and stores it in the cache
 * under constants::KEY_LCP.
 *
 * The inverse suffix array is constructed via construct_isa() and then read
 * block-wise from config.file_map[constants::KEY_ISA]; the suffix array is
 * loaded into memory and overwritten in place with the LCP values.
 *
 * Returns without storing anything if the text or the suffix array cannot
 * be loaded from the cache.
 *
 * @param config Cache configuration holding text and suffix array.
 */
void construct_lcp_kasai(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    int_vector<> lcp;
    construct_isa(config);
    {
        // Inner scope: text, isa buffer and sa are released before the store.
        int_vector<t_width> text;
        if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) {
            return;
        }
        int_vector_file_buffer<> isa_buf(config.file_map[constants::KEY_ISA], 1000000); // init isa file_buffer
        int_vector<> sa;
        if (!load_from_cache(sa, constants::KEY_SA, config)) {
            return;
        }

        // use Kasai algorithm to compute the lcp values
        size_type text_pos = 0;   // current text position, runs over all blocks
        size_type neighbour = 0;  // suffix preceding text_pos in suffix array order
        size_type rank = 0;       // = isa[text_pos]
        size_type matched = 0;    // current common prefix length
        size_type consumed = 0;   // number of isa entries in already finished blocks
        size_type block_len = isa_buf.load_next_block();
        const size_type n = isa_buf.int_vector_size;
        while (consumed < n) {
            const size_type block_end = consumed + block_len;
            for (; text_pos < block_end; ++text_pos) {
                rank = isa_buf[text_pos - consumed];
                if (rank == 0) {
                    matched = 0;
                    sa[ n-1 ] = 0;
                    continue;
                }
                neighbour = sa[rank - 1];
                if (matched != 0) {
                    --matched;
                }
                assert(text_pos != neighbour);
                // text_pos+matched < n and neighbour+matched < n are not checked,
                // since text[n]=0 and text[i]!=0 (i<n) and text_pos!=neighbour
                while (text[text_pos + matched] == text[neighbour + matched]) {
                    ++matched;
                }
                sa[rank - 1] = matched; // overwrite sa array with lcp values
            }
            consumed += block_len;
            block_len = isa_buf.load_next_block();
        }

        // Shift every value one slot to the right and put 0 in front.
        size_type slot = sa.size();
        while (slot > 1) {
            sa[slot - 1] = sa[slot - 2];
            --slot;
        }
        sa[0] = 0;
        lcp.swap(sa);
    }
    store_to_cache(lcp, constants::KEY_LCP, config);
}
/**
 * Builds the LCP array with the Kasai algorithm and stores it in the cache
 * under conf::KEY_LCP.
 *
 * The inverse suffix array is constructed via construct_isa() and streamed
 * from config.file_map[conf::KEY_ISA]; the suffix array is loaded into
 * memory and overwritten in place with the LCP values.
 *
 * Returns without storing anything if the text or the suffix array cannot
 * be loaded from the cache.
 *
 * @param config Cache configuration holding text and suffix array.
 */
void construct_lcp_kasai(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
    using size_type = int_vector<>::size_type;
    int_vector<> lcp;
    construct_isa(config);
    {
        // Inner scope: text, isa buffer and sa are released before the store.
        int_vector<t_width> text;
        if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) {
            return;
        }
        int_vector_buffer<> isa_buf(config.file_map[conf::KEY_ISA], std::ios::in, 1000000); // init isa file_buffer
        int_vector<> sa;
        if (!load_from_cache(sa, conf::KEY_SA, config)) {
            return;
        }

        // use Kasai algorithm to compute the lcp values
        const size_type n = isa_buf.size();
        size_type neighbour = 0; // suffix preceding text_pos in suffix array order
        size_type rank = 0;      // = isa[text_pos]
        size_type matched = 0;   // current common prefix length
        for (size_type text_pos = 0; text_pos < n; ++text_pos) {
            rank = isa_buf[text_pos];
            if (rank == 0) {
                matched = 0;
                sa[ n-1 ] = 0;
                continue;
            }
            neighbour = sa[rank - 1];
            if (matched != 0) {
                --matched;
            }
            assert(text_pos != neighbour);
            // text_pos+matched < n and neighbour+matched < n are not checked,
            // since text[n]=0 and text[i]!=0 (i<n) and text_pos!=neighbour
            while (text[text_pos + matched] == text[neighbour + matched]) {
                ++matched;
            }
            sa[rank - 1] = matched; // overwrite sa array with lcp values
        }

        // Shift every value one slot to the right and put 0 in front.
        size_type slot = sa.size();
        while (slot > 1) {
            sa[slot - 1] = sa[slot - 2];
            --slot;
        }
        sa[0] = 0;
        lcp.swap(sa);
    }
    store_to_cache(lcp, conf::KEY_LCP, config);
}
std::string cache_read() { PROFILE_FUNC(); std::string data; PROFILE_START(action_find); // Starts new action which will be inner to ACTION_READ bool found = find_record(); PROFILE_STOP(action_find); if (!found) { PROFILE_BLOCK(load_from_disk); data = read_from_disk(); put_into_cache(data); return data; // Here all action guards are destructed and actions are correctly finished } data = load_from_cache(); return data; }
int main( int argc, char** argv ) { /* parse command line */ cmdargs_t args = parse_args(argc,argv); /* parse repo */ auto cc = surf::parse_collection(args.collection_dir); sdsl::int_vector_buffer<> T(args.collection_dir+"/"+surf::TEXT_FILENAME); std::cout << "n = |T|= " << T.size() << std::endl; surf::construct_doc_cnt<sdsl::int_alphabet_tag::WIDTH>(cc); uint64_t doc_cnt = 0; load_from_cache(doc_cnt, surf::KEY_DOCCNT, cc); std::cout << "number of documents = N = " << doc_cnt << std::endl; std::ifstream dic_fs(args.collection_dir+"/"+surf::DICT_FILENAME); std::string line; size_t num_terms = 0; while( std::getline(dic_fs,line) ) { num_terms++; } std::cout << "number of terms = sigma = " << num_terms << std::endl; std::cout << "avg document length = " << T.size() / doc_cnt << std::endl; }
/**
 * Constructs the suffix array of the cached text and stores it under
 * conf::KEY_SA.
 *
 * For t_width == 8 the text is loaded into memory and handed to
 * algorithm::calculate_sa (divsufsort); for t_width == 0 the text file is
 * passed to sdsl::qsufsort::construct_sa.
 *
 * @param config Cache configuration holding the text.
 */
void construct_sa(cache_config& config)
{
    static_assert(t_width == 0 or t_width == 8 , "construct_sa: width must be `0` for integer alphabet and `8` for byte alphabet");
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;

    if (t_width == 8) {
        // byte alphabet: call divsufsort
        using text_type = int_vector<t_width>;
        text_type text;
        load_from_cache(text, KEY_TEXT, config);
        int_vector<> sa(text.size(), 0, bits::hi(text.size())+1);
        const unsigned char* raw_text = reinterpret_cast<const unsigned char*>(text.data());
        algorithm::calculate_sa(raw_text, text.size(), sa);
        store_to_cache(sa, conf::KEY_SA, config);
        return;
    }

    if (t_width == 0) {
        // integer alphabet: call qsufsort
        int_vector<> sa;
        sdsl::qsufsort::construct_sa(sa, config.file_map[KEY_TEXT].c_str(), 0);
        store_to_cache(sa, conf::KEY_SA, config);
        return;
    }

    // Not reachable while the static_assert above holds.
    std::cerr << "Unknown alphabet type" << std::endl;
}
/**
 * Builds the LCP array with the PHI approach and registers it in the cache
 * (variant based on int_vector_file_buffer and a hand-written output file).
 *
 * Steps, as performed below:
 *   1. read the suffix array block-wise from
 *      config.file_map[constants::KEY_SA] and fill `plcp` so that
 *      plcp[SA[i]] = SA[i-1] (0 for i == 0);
 *   2. load the text from the cache;
 *   3. overwrite `plcp` in text order with the length of the common prefix
 *      of each suffix and the suffix recorded for it in step 1;
 *   4. write a header (bit size, width) followed by the packed values in
 *      suffix array order to the constants::KEY_LCP cache file.
 *
 * @param config Cache configuration; must already contain the suffix array
 *               and the text.
 */
void construct_lcp_PHI(cache_config& config)
{
    typedef int_vector<>::size_type size_type;
    typedef int_vector<t_width> text_type;
    const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;

    int_vector_file_buffer<> sa_buf(config.file_map[constants::KEY_SA]);
    size_type n = sa_buf.int_vector_size;
    assert(n > 0);

    // Special case: the input consists of the sentinel character only.
    if (n == 1) {
        int_vector<> lcp(1, 0);
        store_to_cache(lcp, constants::KEY_LCP, config);
        return;
    }

    // (1) Calculate PHI (stored in array plcp)
    int_vector<> plcp(n, 0, sa_buf.width);
    {
        size_type rank = 0;            // global index into the suffix array
        size_type consumed = 0;        // entries contained in finished blocks
        size_type previous_suffix = 0;
        size_type block_len = sa_buf.load_next_block();
        while (consumed < n) {
            for (; rank < consumed + block_len; ++rank) {
                size_type current_suffix = sa_buf[rank - consumed];
                plcp[current_suffix] = previous_suffix;
                previous_suffix = current_suffix;
            }
            consumed += block_len;
            block_len = sa_buf.load_next_block();
        }
    }

    // (2) Load text from disk
    text_type text;
    load_from_cache(text, KEY_TEXT, config);

    // (3) Calculate permuted LCP array (text order), called PLCP
    size_type max_l = 0;
    size_type matched = 0; // carried over between positions, reduced by one each step
    for (size_type pos = 0; pos + 1 < n; ++pos) {
        size_type partner = plcp[pos];
        // No explicit bounds check: presumably the terminating sentinel
        // stops the comparison - confirm against the text format.
        while (text[pos + matched] == text[partner + matched]) {
            ++matched;
        }
        plcp[pos] = matched;
        if (matched != 0) {
            if (max_l < matched) {
                max_l = matched;
            }
            --matched;
        }
    }
    util::clear(text);
    uint8_t lcp_width = bits::hi(max_l)+1;

    // (4) Transform PLCP into LCP
    std::string lcp_file = cache_file_name(constants::KEY_LCP, config);
    // NOTE(review): opened with std::ios::app - presumably the file is
    // expected not to exist yet; confirm.
    osfstream lcp_out_buf(lcp_file, std::ios::binary | std::ios::app | std::ios::out); // open buffer for lcp
    size_type bit_size = n*lcp_width;
    lcp_out_buf.write(reinterpret_cast<char*>(&bit_size), sizeof(bit_size));   // write size of vector
    lcp_out_buf.write(reinterpret_cast<char*>(&lcp_width), sizeof(lcp_width)); // write int_width of vector

    size_type wb = 0;                // bytes written into lcp int_vector
    size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
    int_vector<> lcp_buf(buffer_size, 0, lcp_width);
    lcp_buf[0] = 0;
    sa_buf.reset(buffer_size);

    // The first pass runs with an empty block and only triggers the first
    // load; index 0 of the first block keeps the 0 written above because
    // `rank` starts at 1.
    size_type block_len = 0;
    size_type rank = 1;
    size_type consumed = 0;
    while (consumed < n) {
        for (; rank < consumed + block_len; ++rank) {
            size_type suffix = sa_buf[rank - consumed];
            lcp_buf[rank - consumed] = plcp[suffix];
        }
        if (block_len > 0) {
            size_type cur_wb = (block_len*lcp_buf.width()+7)/8;
            lcp_out_buf.write(reinterpret_cast<const char*>(lcp_buf.data()), cur_wb);
            wb += cur_wb;
        }
        consumed += block_len;
        block_len = sa_buf.load_next_block();
    }

    // Pad the payload to a multiple of 8 bytes.
    if (wb%8) {
        lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8);
    }
    lcp_out_buf.close();
    register_cache_file(constants::KEY_LCP, config);
}