/** * Do an actual search. * * @param table table containing organized entries to search from * @param search_term the query string * @param callback routine to invoke for each match * @param ctx user-supplied data to pass on to callback * @param max_res maximum amount of results to return * @param qhv query hash vector built from query string, for routing * * @return number of hits we produced */ G_GNUC_HOT int st_search( search_table_t *table, const char *search_term, st_search_callback callback, gpointer ctx, int max_res, query_hashvec_t *qhv) { char *search; int key, nres = 0; guint i, len; struct st_bin *best_bin = NULL; int best_bin_size = INT_MAX; word_vec_t *wovec; guint wocnt; cpattern_t **pattern; struct st_entry **vals; guint vcnt; int scanned = 0; /* measure search mask efficiency */ guint32 search_mask; size_t minlen; guint random_offset; /* Randomizer for search returns */ search = UNICODE_CANONIZE(search_term); if (GNET_PROPERTY(query_debug) > 4 && 0 != strcmp(search, search_term)) { char *safe_search = hex_escape(search, FALSE); char *safe_search_term = hex_escape(search_term, FALSE); g_debug("original search term: \"%s\"", safe_search_term); g_debug("canonical search term: \"%s\"", safe_search); if (safe_search != search) HFREE_NULL(safe_search); if (safe_search_term != search_term) HFREE_NULL(safe_search_term); } len = strlen(search); /* * Find smallest bin */ if (len >= 2) { for (i = 0; i < len - 1; i++) { struct st_bin *bin; if (is_ascii_space(search[i]) || is_ascii_space(search[i+1])) continue; key = st_key(table, search + i); if ((bin = table->bins[key]) == NULL) { best_bin = NULL; break; } if (bin->nvals < best_bin_size) { best_bin = bin; best_bin_size = bin->nvals; } } if (GNET_PROPERTY(matching_debug) > 4) g_debug("MATCH st_search(): str=\"%s\", len=%d, best_bin_size=%d", lazy_safe_search(search_term), len, best_bin_size); } /* * If the best_bin is NULL, we did not find a matching bin, and we're * sure we won't be able to find the search string. * * Note that on search strings like "r e m ", we always have a letter * followed by spaces, so we won't search that. * --RAM, 06/10/2001 */ if (best_bin == NULL) { /* * If we have a `qhv', we need to compute the word vector anway, * for query routing... */ if (qhv == NULL) goto finish; } /* * Prepare matching patterns */ wocnt = word_vec_make(search, &wovec); /* * Compute the query hashing information for query routing, if needed. */ if (qhv != NULL) { for (i = 0; i < wocnt; i++) { if (wovec[i].len >= QRP_MIN_WORD_LENGTH) qhvec_add(qhv, wovec[i].word, QUERY_H_WORD); } } if (wocnt == 0 || best_bin == NULL) { if (wocnt > 0) word_vec_free(wovec, wocnt); goto finish; } g_assert(best_bin_size > 0); /* Allocated bin, it must hold something */ pattern = walloc0(wocnt * sizeof *pattern); /* * Prepare matching optimization, an idea from Mike Green. * * At library building time, we computed a mask hash, made from the * lowercased file name, using one bit per different letter, roughly * (see mask_hash() for the exact algorigthm). * * We're now going to compute the same mask on the query, and compare * it bitwise with the mask for each file. If the file does not hold * at least all the chars present in the query, it's no use applying * the pattern matching algorithm, it won't match at all. * * --RAM, 01/10/2001 */ search_mask = mask_hash(search); /* * Prepare second matching optimization: since all words in the query * must match the exact amount of time, we can compute the minimum length * the searched file must have. We add one character after each word * but the last, to account for space between words. * --RAM, 11/07/2002 */ for (minlen = 0, i = 0; i < wocnt; i++) minlen += wovec[i].len + 1; minlen--; g_assert(minlen <= INT_MAX); /* * Search through the smallest bin */ vcnt = best_bin->nvals; vals = best_bin->vals; random_offset = random_u32() % vcnt; nres = 0; for (i = 0; i < vcnt; i++) { const struct st_entry *e; shared_file_t *sf; size_t canonic_len; /* * As we only return a limited count of results, pick a random * offset, so that repeated searches will match different items * instead of always the first - with some probability. */ e = vals[(i + random_offset) % vcnt]; if ((e->mask & search_mask) != search_mask) continue; /* Can't match */ sf = e->sf; canonic_len = shared_file_name_canonic_len(sf); if (canonic_len < minlen) continue; /* Can't match */ scanned++; if (entry_match(e->string, canonic_len, pattern, wovec, wocnt)) { if (GNET_PROPERTY(matching_debug) > 4) { g_debug("MATCH \"%s\" matches %s", search, shared_file_name_nfc(sf)); } if ((*callback)(ctx, sf)) { nres++; if (nres >= max_res) break; } } } if (GNET_PROPERTY(matching_debug) > 3) g_debug("MATCH st_search(): scanned %d entr%s from the %d in bin, " "got %d match%s", scanned, 1 == scanned ? "y" : "ies", best_bin_size, nres, 1 == nres ? "" : "es"); for (i = 0; i < wocnt; i++) if (pattern[i]) /* Lazily compiled by entry_match() */ pattern_free(pattern[i]); wfree(pattern, wocnt * sizeof *pattern); word_vec_free(wovec, wocnt); finish: if (search != search_term) { HFREE_NULL(search); } return nres; }
static int search(struct cmdctx *cmdctx) { struct poclidek_ctx *cctx = NULL; tn_array *pkgs = NULL; tn_array *matched_pkgs = NULL; int i, err = 0, display_bar = 0, bar_v; int term_height; struct pattern *pt; unsigned flags; if ((pt = cmdctx->_data) == NULL) { logn(LOGERR, _("search: no pattern given")); err++; goto l_end; } cmdctx->_data = NULL; /* we'll free pattern myself */ cctx = cmdctx->cctx; flags = cmdctx->_flags; flags &= ~OPT_NO_SEARCHSW; if (flags == 0) cmdctx->_flags |= OPT_SEARCH_DEFAULT; init_pcre(); if (!pattern_compile(pt, poldek_ts_get_arg_count(cmdctx->ts))) { err++; goto l_end; } poclidek_load_packages(cmdctx->cctx, POCLIDEK_LOAD_ALL); if (poldek_ts_get_arg_count(cmdctx->ts) == 0) { pkgs = poclidek_get_dent_packages(cctx, NULL); } else { pkgs = poclidek_resolve_packages(NULL, cctx, cmdctx->ts, 0); } if (pkgs == NULL) return 0; matched_pkgs = n_array_new(32, NULL, NULL); if (n_array_size(pkgs) > 5 && (cmdctx->_flags & OPT_SEARCH_HDD)) { display_bar = 1; msg(0, _("Searching packages...")); } bar_v = 0; for (i=0; i < n_array_size(pkgs); i++) { struct pkg *pkg = n_array_nth(pkgs, i); if (pkg_match(pkg, pt, cmdctx->_flags)) n_array_push(matched_pkgs, pkg); if (display_bar) { int v, j; v = i * 40 / n_array_size(pkgs); for (j = bar_v; j < v; j++) msg(0, "_."); bar_v = v; } if (sigint_reached()) { msgn(0, _("_interrupted.")); goto l_end; } } if (display_bar) msgn(0, _("_done.")); term_height = poldek_term_get_height(); if (n_array_size(matched_pkgs) == 0) cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!No package matches '%s'\n", pt->regexp); else if (n_array_size(matched_pkgs) < term_height) cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!%d package(s) found:\n", n_array_size(matched_pkgs)); for (i=0; i<n_array_size(matched_pkgs); i++) { struct pkg *pkg; pkg = n_array_nth(matched_pkgs, i); cmdctx_addtoresult(cmdctx, pkg); cmdctx_printf(cmdctx, "%s\n", pkg_id(pkg)); } if (n_array_size(matched_pkgs) >= term_height) cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!%d package(s) found.\n", n_array_size(matched_pkgs)); l_end: if (pkgs) n_array_free(pkgs); if (matched_pkgs) n_array_free(matched_pkgs); if (cmdctx->_data) cmdctx->_data = NULL; if (pt) pattern_free(pt); return 1; }