/**
 * Tally every word of a keyword query into the search statistics.
 *
 * Queries of type QUERY_SHA1 are ignored.  The query string is duplicated
 * and the word vector is built from that private copy, which is released
 * before returning.
 * NOTE(review): presumably the copy is there because word_vec_make() may
 * alter its input in this version of the code -- confirm.
 *
 * @param type			the type of the query
 * @param search		the query string
 * @param unused_addr	ignored
 * @param unused_port	ignored
 */
static void
search_stats_notify_word(query_type_t type, const gchar *search,
	const host_addr_t unused_addr, guint16 unused_port)
{
	word_vec_t *words;
	guint count;
	gchar *copy;

	(void) unused_addr;
	(void) unused_port;

	if (QUERY_SHA1 == type)
		return;

	copy = g_strdup(search);
	count = word_vec_make(copy, &words);

	if (count > 0) {
		guint n = 0;

		while (n < count) {
			search_stats_tally(&words[n]);
			n++;
		}
		word_vec_free(words, count);
	}

	G_FREE_NULL(copy);
}
/**
 * Fill non-NULL query hash vector for query routing.
 *
 * This needs to be called when st_search() is not called when processing
 * a query, otherwise the query hash vector won't be properly initialized
 * and the query would be improperly dropped by qrt_build_query_target(),
 * hence never routed.
 *
 * Only words whose length is at least QRP_MIN_WORD_LENGTH are added.
 *
 * @param search_term	the query string
 * @param qhv			the query hash vector to fill; nothing is done if NULL
 */
void
st_fill_qhv(const char *search_term, query_hashvec_t *qhv)
{
	char *canonic;
	word_vec_t *words;
	guint count;
	guint n;

	if (NULL == qhv)
		return;

	canonic = UNICODE_CANONIZE(search_term);
	count = word_vec_make(canonic, &words);

	for (n = 0; n < count; n++) {
		if (words[n].len < QRP_MIN_WORD_LENGTH)
			continue;		/* Too short to be hashed */
		qhvec_add(qhv, words[n].word, QUERY_H_WORD);
	}

	/* Canonization may hand back the original string: free only a copy */
	if (canonic != search_term) {
		HFREE_NULL(canonic);
	}

	if (count > 0) {
		word_vec_free(words, count);
	}
}
/**
 * Tally every word of a keyword query into the search statistics.
 *
 * Queries of type QUERY_SHA1 are ignored, as are queries from which
 * word_vec_make() extracts no word.
 *
 * @param type			the type of the query
 * @param search		the query string
 * @param unused_addr	ignored
 * @param unused_port	ignored
 */
static void
search_stats_notify_word(query_type_t type, const char *search,
	const host_addr_t unused_addr, guint16 unused_port)
{
	word_vec_t *words;
	unsigned count;
	unsigned n;

	(void) unused_addr;
	(void) unused_port;

	if (QUERY_SHA1 == type)
		return;

	count = word_vec_make(search, &words);
	if (0 == count)
		return;				/* No word: nothing was allocated for us */

	for (n = 0; n < count; n++)
		search_stats_tally(&words[n]);

	word_vec_free(words, count);
}
/** * Do an actual search. * * @param table table containing organized entries to search from * @param search_term the query string * @param callback routine to invoke for each match * @param ctx user-supplied data to pass on to callback * @param max_res maximum amount of results to return * @param qhv query hash vector built from query string, for routing * * @return number of hits we produced */ G_GNUC_HOT int st_search( search_table_t *table, const char *search_term, st_search_callback callback, gpointer ctx, int max_res, query_hashvec_t *qhv) { char *search; int key, nres = 0; guint i, len; struct st_bin *best_bin = NULL; int best_bin_size = INT_MAX; word_vec_t *wovec; guint wocnt; cpattern_t **pattern; struct st_entry **vals; guint vcnt; int scanned = 0; /* measure search mask efficiency */ guint32 search_mask; size_t minlen; guint random_offset; /* Randomizer for search returns */ search = UNICODE_CANONIZE(search_term); if (GNET_PROPERTY(query_debug) > 4 && 0 != strcmp(search, search_term)) { char *safe_search = hex_escape(search, FALSE); char *safe_search_term = hex_escape(search_term, FALSE); g_debug("original search term: \"%s\"", safe_search_term); g_debug("canonical search term: \"%s\"", safe_search); if (safe_search != search) HFREE_NULL(safe_search); if (safe_search_term != search_term) HFREE_NULL(safe_search_term); } len = strlen(search); /* * Find smallest bin */ if (len >= 2) { for (i = 0; i < len - 1; i++) { struct st_bin *bin; if (is_ascii_space(search[i]) || is_ascii_space(search[i+1])) continue; key = st_key(table, search + i); if ((bin = table->bins[key]) == NULL) { best_bin = NULL; break; } if (bin->nvals < best_bin_size) { best_bin = bin; best_bin_size = bin->nvals; } } if (GNET_PROPERTY(matching_debug) > 4) g_debug("MATCH st_search(): str=\"%s\", len=%d, best_bin_size=%d", lazy_safe_search(search_term), len, best_bin_size); } /* * If the best_bin is NULL, we did not find a matching bin, and we're * sure we won't be able to find the search 
string. * * Note that on search strings like "r e m ", we always have a letter * followed by spaces, so we won't search that. * --RAM, 06/10/2001 */ if (best_bin == NULL) { /* * If we have a `qhv', we need to compute the word vector anway, * for query routing... */ if (qhv == NULL) goto finish; } /* * Prepare matching patterns */ wocnt = word_vec_make(search, &wovec); /* * Compute the query hashing information for query routing, if needed. */ if (qhv != NULL) { for (i = 0; i < wocnt; i++) { if (wovec[i].len >= QRP_MIN_WORD_LENGTH) qhvec_add(qhv, wovec[i].word, QUERY_H_WORD); } } if (wocnt == 0 || best_bin == NULL) { if (wocnt > 0) word_vec_free(wovec, wocnt); goto finish; } g_assert(best_bin_size > 0); /* Allocated bin, it must hold something */ pattern = walloc0(wocnt * sizeof *pattern); /* * Prepare matching optimization, an idea from Mike Green. * * At library building time, we computed a mask hash, made from the * lowercased file name, using one bit per different letter, roughly * (see mask_hash() for the exact algorigthm). * * We're now going to compute the same mask on the query, and compare * it bitwise with the mask for each file. If the file does not hold * at least all the chars present in the query, it's no use applying * the pattern matching algorithm, it won't match at all. * * --RAM, 01/10/2001 */ search_mask = mask_hash(search); /* * Prepare second matching optimization: since all words in the query * must match the exact amount of time, we can compute the minimum length * the searched file must have. We add one character after each word * but the last, to account for space between words. 
* --RAM, 11/07/2002 */ for (minlen = 0, i = 0; i < wocnt; i++) minlen += wovec[i].len + 1; minlen--; g_assert(minlen <= INT_MAX); /* * Search through the smallest bin */ vcnt = best_bin->nvals; vals = best_bin->vals; random_offset = random_u32() % vcnt; nres = 0; for (i = 0; i < vcnt; i++) { const struct st_entry *e; shared_file_t *sf; size_t canonic_len; /* * As we only return a limited count of results, pick a random * offset, so that repeated searches will match different items * instead of always the first - with some probability. */ e = vals[(i + random_offset) % vcnt]; if ((e->mask & search_mask) != search_mask) continue; /* Can't match */ sf = e->sf; canonic_len = shared_file_name_canonic_len(sf); if (canonic_len < minlen) continue; /* Can't match */ scanned++; if (entry_match(e->string, canonic_len, pattern, wovec, wocnt)) { if (GNET_PROPERTY(matching_debug) > 4) { g_debug("MATCH \"%s\" matches %s", search, shared_file_name_nfc(sf)); } if ((*callback)(ctx, sf)) { nres++; if (nres >= max_res) break; } } } if (GNET_PROPERTY(matching_debug) > 3) g_debug("MATCH st_search(): scanned %d entr%s from the %d in bin, " "got %d match%s", scanned, 1 == scanned ? "y" : "ies", best_bin_size, nres, 1 == nres ? "" : "es"); for (i = 0; i < wocnt; i++) if (pattern[i]) /* Lazily compiled by entry_match() */ pattern_free(pattern[i]); wfree(pattern, wocnt * sizeof *pattern); word_vec_free(wovec, wocnt); finish: if (search != search_term) { HFREE_NULL(search); } return nres; }