void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, lcp_tag) { auto event = memory_monitor::event("construct compressed LCP"); const char* KEY_TEXT = key_text_trait<t_width>::KEY_TEXT; typedef int_vector<t_width> text_type; { // (2) check, if the longest common prefix array is cached auto event = memory_monitor::event("LCP"); if (!cache_file_exists(conf::KEY_LCP, config)) { { auto event = memory_monitor::event("parse input text"); // (1) check, if the text is cached if (!cache_file_exists(KEY_TEXT, config)) { text_type text; load_vector_from_file(text, file, num_bytes); if (contains_no_zero_symbol(text, file)) { append_zero_symbol(text); store_to_cache(text,KEY_TEXT, config); } } register_cache_file(KEY_TEXT, config); } { // (2) check, if the suffix array is cached auto event = memory_monitor::event("SA"); if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_width>(config); } register_cache_file(conf::KEY_SA, config); } if (t_width==8) { construct_lcp_semi_extern_PHI(config); } else { construct_lcp_PHI<t_width>(config); } } register_cache_file(conf::KEY_LCP, config); } { auto event = memory_monitor::event("compressed LCP"); t_index tmp(config); tmp.swap(idx); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
/*
 * Report whether the cached copy of `f` can be used.
 *
 * Returns false when cache_valid() or cache_file_exists() reject `f`, when
 * stat() on the cache path fails, or when f->st->st_mtime is newer than the
 * modification time of the file at the cache path. Returns true otherwise.
 */
static bool cache_file_valid(struct file *f)
{
	struct stat cached;
	char *path;
	int rc;

	if (!cache_valid(f) || !cache_file_exists(f))
		return false;

	/* The path string is released right after stat(), as in every exit path. */
	path = cache_path(f);
	rc = stat(path, &cached);
	free(path);

	if (rc != 0)
		return false;

	/* Valid only if f's recorded mtime is not newer than the cache file's. */
	return !(f->st->st_mtime > cached.st_mtime);
}
static enum path_treatment treat_one_path(struct dir_struct *dir, struct strbuf *path, const struct path_simplify *simplify, int dtype, struct dirent *de) { int exclude; int has_path_in_index = !!cache_file_exists(path->buf, path->len, ignore_case); if (dtype == DT_UNKNOWN) dtype = get_dtype(de, path->buf, path->len); /* Always exclude indexed files */ if (dtype != DT_DIR && has_path_in_index) return path_none; /* * When we are looking at a directory P in the working tree, * there are three cases: * * (1) P exists in the index. Everything inside the directory P in * the working tree needs to go when P is checked out from the * index. * * (2) P does not exist in the index, but there is P/Q in the index. * We know P will stay a directory when we check out the contents * of the index, but we do not know yet if there is a directory * P/Q in the working tree to be killed, so we need to recurse. * * (3) P does not exist in the index, and there is no P/Q in the index * to require P to be a directory, either. Only in this case, we * know that everything inside P will not be killed without * recursing. */ if ((dir->flags & DIR_COLLECT_KILLED_ONLY) && (dtype == DT_DIR) && !has_path_in_index && (directory_exists_in_index(path->buf, path->len) == index_nonexistent)) return path_none; exclude = is_excluded(dir, path->buf, &dtype); /* * Excluded? If we don't explicitly want to show * ignored files, ignore it */ if (exclude && !(dir->flags & (DIR_SHOW_IGNORED|DIR_SHOW_IGNORED_TOO))) return path_excluded; switch (dtype) { default: return path_none; case DT_DIR: strbuf_addch(path, '/'); return treat_directory(dir, path->buf, path->len, exclude, simplify); case DT_REG: case DT_LNK: return exclude ? path_excluded : path_untracked; } }
/*!
 * Constructs the CSA from files referenced by `config`.
 *
 * If the BWT is not present in the cache the constructor returns immediately
 * and leaves the members as they were default-initialized. Otherwise the
 * alphabet, the wavelet tree, the SA samples and the ISA samples are built in
 * that order, each inside its own memory-monitor event.
 */
csa_wt<t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_wt(cache_config& config)
{
    const bool bwt_cached = cache_file_exists(key_trait<alphabet_type::int_width>::KEY_BWT, config);
    if (!bwt_cached) {
        return;
    }

    {
        // Alphabet: built from a buffered view of the BWT.
        auto alphabet_event = memory_monitor::event("construct csa-alpbabet");
        int_vector_buffer<alphabet_type::int_width> bwt_in(
            cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT, config));
        const size_type bwt_len = bwt_in.size();
        alphabet_type built_alphabet(bwt_in, bwt_len);
        m_alphabet.swap(built_alphabet);
    }

    {
        // Wavelet tree: the BWT buffer is reopened for this pass.
        auto wt_event = memory_monitor::event("construct wavelet tree");
        int_vector_buffer<alphabet_type::int_width> bwt_in(
            cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT, config));
        const size_type bwt_len = bwt_in.size();
        wavelet_tree_type built_wt(bwt_in, bwt_len);
        m_wavelet_tree.swap(built_wt);
    }

    {
        auto sa_event = memory_monitor::event("sample SA");
        sa_sample_type built_sa_sample(config);
        m_sa_sample.swap(built_sa_sample);
    }

    {
        // The ISA sample is constructed with a pointer to the SA sample;
        // swap_support receives that same pointer twice.
        auto isa_event = memory_monitor::event("sample ISA");
        isa_sample_type built_isa_sample(config, &m_sa_sample);
        util::swap_support(m_isa_sample, built_isa_sample, &m_sa_sample, &m_sa_sample);
    }
}
/*
 * Append a new entry for `pathname` (length `len`) to dir->entries.
 *
 * Returns NULL without adding anything when cache_file_exists() finds the
 * path; otherwise returns the newly created entry.
 */
static struct dir_entry *dir_add_name(struct dir_struct *dir, const char *pathname, int len)
{
	struct dir_entry *added;

	if (cache_file_exists(pathname, len, ignore_case))
		return NULL;

	/* Make room for one more slot before storing the new entry. */
	ALLOC_GROW(dir->entries, dir->nr+1, dir->alloc);
	added = dir_entry_new(pathname, len);
	dir->entries[dir->nr] = added;
	dir->nr++;
	return added;
}
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, csa_tag) { auto event = memory_monitor::event("construct CSA"); const char* KEY_TEXT = key_text_trait<t_index::alphabet_category::WIDTH>::KEY_TEXT; const char* KEY_BWT = key_bwt_trait<t_index::alphabet_category::WIDTH>::KEY_BWT; typedef int_vector<t_index::alphabet_category::WIDTH> text_type; { auto event = memory_monitor::event("parse input text"); // (1) check, if the text is cached if (!cache_file_exists(KEY_TEXT, config)) { text_type text; load_vector_from_file(text, file, num_bytes); if (contains_no_zero_symbol(text, file)) { append_zero_symbol(text); store_to_cache(text,KEY_TEXT, config); } } register_cache_file(KEY_TEXT, config); } { // (2) check, if the suffix array is cached auto event = memory_monitor::event("SA"); if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_index::alphabet_category::WIDTH>(config); } register_cache_file(conf::KEY_SA, config); } { // (3) construct BWT auto event = memory_monitor::event("BWT"); if (!cache_file_exists(KEY_BWT, config)) { construct_bwt<t_index::alphabet_category::WIDTH>(config); } register_cache_file(KEY_BWT, config); } { // (4) use BWT to construct the CSA auto event = memory_monitor::event("construct CSA"); t_index tmp(config); idx.swap(tmp); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
void construct(t_index& idx, const std::string& file, cache_config& config, uint8_t num_bytes, cst_tag) { auto event = memory_monitor::event("construct CST"); const char* KEY_TEXT = key_text_trait<t_index::alphabet_category::WIDTH>::KEY_TEXT; const char* KEY_BWT = key_bwt_trait<t_index::alphabet_category::WIDTH>::KEY_BWT; csa_tag csa_t; { // (1) check, if the compressed suffix array is cached typename t_index::csa_type csa; if (!cache_file_exists(std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config)) { cache_config csa_config(false, config.dir, config.id, config.file_map); construct(csa, file, csa_config, num_bytes, csa_t); auto event = memory_monitor::event("store CSA"); config.file_map = csa_config.file_map; store_to_cache(csa,std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config); } register_cache_file(std::string(conf::KEY_CSA)+"_"+util::class_to_hash(csa), config); } { // (2) check, if the longest common prefix array is cached auto event = memory_monitor::event("LCP"); register_cache_file(KEY_TEXT, config); register_cache_file(KEY_BWT, config); register_cache_file(conf::KEY_SA, config); if (!cache_file_exists(conf::KEY_LCP, config)) { if (t_index::alphabet_category::WIDTH==8) { construct_lcp_semi_extern_PHI(config); } else { construct_lcp_PHI<t_index::alphabet_category::WIDTH>(config); } } register_cache_file(conf::KEY_LCP, config); } { auto event = memory_monitor::event("CST"); t_index tmp(config); tmp.swap(idx); } if (config.delete_files) { auto event = memory_monitor::event("delete temporary files"); util::delete_all_files(config.file_map); } }
/// Loads the collection statistics from the cache described by `cconfig`.
///
/// Reads the value stored under surf::KEY_COLLEN, loads the document lengths
/// (constructing them first when they are not cached), then sets `num_docs`
/// and `avg_doc_len` and reports both on stderr.
rank_bm25(cache_config& cconfig)
{
    uint64_t collection_len;
    load_from_cache(collection_len, surf::KEY_COLLEN, cconfig);

    const bool lengths_cached = cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig);
    if (!lengths_cached) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    // Floating-point division: value loaded from KEY_COLLEN over document count.
    avg_doc_len = static_cast<double>(collection_len) / static_cast<double>(num_docs);

    std::cerr << "num_docs = " << num_docs << std::endl;
    std::cerr << "avg_doc_len = " << avg_doc_len << std::endl;
}
/// Loads the collection statistics from the cache described by `cconfig`.
///
/// Reads `num_terms` from the value stored under surf::KEY_COLLEN, loads the
/// document lengths (constructing them first when they are not cached), then
/// sets `num_docs` and `min_doc_len` and reports both on stderr.
///
/// When there are no document lengths, `min_doc_len` is set to 0.
rank_tfidf(cache_config& cconfig)
{
    load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
    if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)) {
        surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
    }
    load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);

    num_docs = doc_lengths.size();
    std::cerr << "num_docs = " << num_docs << std::endl;

    // std::min_element returns the end iterator for an empty range, and
    // dereferencing that is undefined behaviour, so guard the dereference.
    auto min_itr = std::min_element(doc_lengths.begin(), doc_lengths.end());
    if (min_itr != doc_lengths.end()) {
        min_doc_len = *min_itr;
    } else {
        min_doc_len = 0;
    }
    std::cerr << "min_doc_len = " << min_doc_len << std::endl;
}
/*!
 * Constructs the CSA from files referenced by `config`.
 *
 * If the BWT is not present in the cache, or if the intermediate PSI vector
 * cannot be stored to the cache, the constructor returns early and the
 * remaining members are left as they were.
 * Otherwise the alphabet, the encoded PSI function, the SA samples and the
 * ISA samples are built in that order, each inside its own memory-monitor
 * event.
 */
csa_sada<t_enc_vec, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat>::csa_sada(cache_config& config)
{
    create_buffer();
    if (!cache_file_exists(key_trait<alphabet_type::int_width>::KEY_BWT, config)) {
        return;
    }
    int_vector_buffer<alphabet_type::int_width> bwt_buf(cache_file_name(key_trait<alphabet_type::int_width>::KEY_BWT,config));
    size_type n = bwt_buf.size();
    {
        auto event = memory_monitor::event("construct csa-alpbabet");
        alphabet_type tmp_alphabet(bwt_buf, n);
        m_alphabet.swap(tmp_alphabet);
    }

    // Working copy of C[0..sigma), one entry per symbol, wide enough to hold n.
    // NOTE(review): C is presumably the cumulative symbol count of the
    // alphabet -- confirm.
    int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1);
    for (typename alphabet_type::sigma_type i=0; i < sigma; ++i) {
        cnt_chr[i] = C[i];
    }
    // calculate psi
    {
        auto event = memory_monitor::event("construct PSI");
        // TODO: move PSI construct into construct_PSI.hpp
        int_vector<> psi(n, 0, bits::hi(n)+1);
        // Each BWT position i is written to the slot given by the running
        // counter of its symbol; the counter is then advanced.
        for (size_type i=0; i < n; ++i) {
            psi[ cnt_chr[ char2comp[bwt_buf[i]] ]++ ] = i;
        }
        if (!store_to_cache(psi, conf::KEY_PSI, config)) {
            return;
        }
    }
    {
        // The plain PSI vector is read back from the cache and encoded.
        auto event = memory_monitor::event("encode PSI");
        int_vector_buffer<> psi_buf(cache_file_name(conf::KEY_PSI, config));
        t_enc_vec tmp_psi(psi_buf);
        m_psi.swap(tmp_psi);
    }
    {
        auto event = memory_monitor::event("sample SA");
        sa_sample_type tmp_sa_sample(config);
        m_sa_sample.swap(tmp_sa_sample);
    }
    {
        // The ISA sample is constructed with a pointer to the SA sample;
        // swap_support receives a null pointer as its last argument.
        auto event = memory_monitor::event("sample ISA");
        isa_sample_type isa_s(config, &m_sa_sample);
        util::swap_support(m_isa_sample, isa_s, &m_sa_sample, static_cast<const sa_sample_type*>(nullptr));
    }
}
static int get_index_dtype(const char *path, int len) { int pos; const struct cache_entry *ce; ce = cache_file_exists(path, len, 0); if (ce) { if (!ce_uptodate(ce)) return DT_UNKNOWN; if (S_ISGITLINK(ce->ce_mode)) return DT_DIR; /* * Nobody actually cares about the * difference between DT_LNK and DT_REG */ return DT_REG; } /* Try to look it up as a directory */ pos = cache_name_pos(path, len); if (pos >= 0) return DT_UNKNOWN; pos = -pos-1; while (pos < active_nr) { ce = active_cache[pos++]; if (strncmp(ce->name, path, len)) break; if (ce->name[len] > '/') break; if (ce->name[len] < '/') continue; if (!ce_uptodate(ce)) break; /* continue? */ return DT_DIR; } return DT_UNKNOWN; }