コード例 #1
0
ファイル: matching.c プロジェクト: Haxe/gtk-gnutella
/**
 * Do an actual search.
 *
 * The canonized query is first used to select the smallest bin among those
 * keyed by each pair of adjacent non-space characters.  Entries of that bin
 * are then filtered by character mask and minimum length before being
 * handed to entry_match().  The scan starts at a random offset in the bin
 * and wraps around, so that repeated capped searches do not always report
 * the same first entries.
 *
 * When `qhv' is non-NULL, the words of the query are added to it even if
 * no bin matches, since the caller needs them for query routing.
 *
 * @param table			table containing organized entries to search from
 * @param search_term	the query string
 * @param callback		routine to invoke for each match; a hit is counted
 *						only when it returns a non-zero value
 * @param ctx			user-supplied data to pass on to callback
 * @param max_res		maximum amount of results to return; the limit is
 *						checked after a hit has been counted
 * @param qhv			query hash vector built from query string, for routing
 *						(may be NULL)
 *
 * @return number of hits we produced
 */
G_GNUC_HOT int
st_search(
	search_table_t *table,
	const char *search_term,
	st_search_callback callback,
	gpointer ctx,
	int max_res,
	query_hashvec_t *qhv)
{
	char *search;
	int key, nres = 0;
	guint i, len;
	struct st_bin *best_bin = NULL;
	int best_bin_size = INT_MAX;
	word_vec_t *wovec;
	guint wocnt;
	cpattern_t **pattern;
	struct st_entry **vals;
	guint vcnt;
	int scanned = 0;		/* measure search mask efficiency */
	guint32 search_mask;
	size_t minlen;
	guint random_offset;  /* Randomizer for search returns */

	/*
	 * The canonized string is released at `finish' only when it is not
	 * the very pointer we were given as `search_term'.
	 */

	search = UNICODE_CANONIZE(search_term);

	if (GNET_PROPERTY(query_debug) > 4 && 0 != strcmp(search, search_term)) {
		char *safe_search = hex_escape(search, FALSE);
		char *safe_search_term = hex_escape(search_term, FALSE);
		g_debug("original search term: \"%s\"", safe_search_term);
		g_debug("canonical search term: \"%s\"", safe_search);
		/* Only free the escaped strings when they are distinct pointers */
		if (safe_search != search)
			HFREE_NULL(safe_search);
		if (safe_search_term != search_term)
			HFREE_NULL(safe_search_term);
	}
	len = strlen(search);

	/*
	 * Find smallest bin
	 */

	if (len >= 2) {
		for (i = 0; i < len - 1; i++) {
			struct st_bin *bin;
			/* Pairs that include a space are not used as bin keys */
			if (is_ascii_space(search[i]) || is_ascii_space(search[i+1]))
				continue;
			key = st_key(table, search + i);
			if ((bin = table->bins[key]) == NULL) {
				/* One missing bin is enough: nothing can match */
				best_bin = NULL;
				break;
			}
			if (bin->nvals < best_bin_size) {
				best_bin = bin;
				best_bin_size = bin->nvals;
			}
		}

		/* `len' is a guint, hence %u and not %d */
		if (GNET_PROPERTY(matching_debug) > 4)
			g_debug("MATCH st_search(): str=\"%s\", len=%u, best_bin_size=%d",
				lazy_safe_search(search_term), len, best_bin_size);
	}

	/*
	 * If the best_bin is NULL, we did not find a matching bin, and we're
	 * sure we won't be able to find the search string.
	 *
	 * Note that on search strings like "r e m ", we always have a letter
	 * followed by spaces, so we won't search that.
	 *		--RAM, 06/10/2001
	 */

	if (best_bin == NULL) {
		/*
		 * If we have a `qhv', we need to compute the word vector anyway,
		 * for query routing...
		 */

		if (qhv == NULL)
			goto finish;
	}

	/*
	 * Prepare matching patterns
	 */

	wocnt = word_vec_make(search, &wovec);

	/*
	 * Compute the query hashing information for query routing, if needed.
	 */

	if (qhv != NULL) {
		for (i = 0; i < wocnt; i++) {
			if (wovec[i].len >= QRP_MIN_WORD_LENGTH)
				qhvec_add(qhv, wovec[i].word, QUERY_H_WORD);
		}
	}

	/*
	 * Nothing to scan: either the query holds no word, or we only came
	 * this far to fill in `qhv'.
	 */

	if (wocnt == 0 || best_bin == NULL) {
		if (wocnt > 0)
			word_vec_free(wovec, wocnt);
		goto finish;
	}

	g_assert(best_bin_size > 0);	/* Allocated bin, it must hold something */


	/* Zeroed array: each slot is filled in lazily by entry_match() */
	pattern = walloc0(wocnt * sizeof *pattern);

	/*
	 * Prepare matching optimization, an idea from Mike Green.
	 *
	 * At library building time, we computed a mask hash, made from the
	 * lowercased file name, using one bit per different letter, roughly
	 * (see mask_hash() for the exact algorithm).
	 *
	 * We're now going to compute the same mask on the query, and compare
	 * it bitwise with the mask for each file.  If the file does not hold
	 * at least all the chars present in the query, it's no use applying
	 * the pattern matching algorithm, it won't match at all.
	 *
	 *		--RAM, 01/10/2001
	 */

	search_mask = mask_hash(search);

	/*
	 * Prepare second matching optimization: since all words in the query
	 * must match the exact amount of time, we can compute the minimum length
	 * the searched file must have.  We add one character after each word
	 * but the last, to account for space between words.
	 *		--RAM, 11/07/2002
	 */

	for (minlen = 0, i = 0; i < wocnt; i++)
		minlen += wovec[i].len + 1;
	minlen--;
	g_assert(minlen <= INT_MAX);

	/*
	 * Search through the smallest bin
	 */

	vcnt = best_bin->nvals;
	vals = best_bin->vals;
	random_offset = random_u32() % vcnt;	/* vcnt > 0, asserted above */

	nres = 0;
	for (i = 0; i < vcnt; i++) {
		const struct st_entry *e;
		shared_file_t *sf;
		size_t canonic_len;

		/*
		 * As we only return a limited count of results, pick a random
		 * offset, so that repeated searches will match different items
		 * instead of always the first - with some probability.
		 */
		e = vals[(i + random_offset) % vcnt];

		if ((e->mask & search_mask) != search_mask)
			continue;		/* Can't match */

		sf = e->sf;

		canonic_len = shared_file_name_canonic_len(sf);
		if (canonic_len < minlen)
			continue;		/* Can't match */

		scanned++;

		if (entry_match(e->string, canonic_len, pattern, wovec, wocnt)) {
			if (GNET_PROPERTY(matching_debug) > 4) {
				g_debug("MATCH \"%s\" matches %s",
					search, shared_file_name_nfc(sf));
			}

			/* The callback decides whether this match counts as a hit */
			if ((*callback)(ctx, sf)) {
				nres++;
				if (nres >= max_res)
					break;
			}
		}
	}

	if (GNET_PROPERTY(matching_debug) > 3)
		g_debug("MATCH st_search(): scanned %d entr%s from the %d in bin, "
			"got %d match%s",
			scanned, 1 == scanned ? "y" : "ies",
			best_bin_size, nres, 1 == nres ? "" : "es");

	for (i = 0; i < wocnt; i++)
		if (pattern[i])					/* Lazily compiled by entry_match() */
			pattern_free(pattern[i]);

	wfree(pattern, wocnt * sizeof *pattern);
	word_vec_free(wovec, wocnt);

finish:
	if (search != search_term) {
		HFREE_NULL(search);
	}

	return nres;
}
コード例 #2
0
ファイル: search.c プロジェクト: megabajt/poldek
/**
 * Execute the "search" command: match packages against the pattern stored
 * in cmdctx->_data and print the identifiers of those that match.
 *
 * The pattern is detached from `cmdctx' and released here on every exit
 * path.  The candidate set is every package returned by
 * poclidek_get_dent_packages() when the transaction carries no argument,
 * or the packages resolved from the arguments otherwise.
 *
 * @param cmdctx	command context holding the pattern, flags and transaction
 *
 * @return 0 when no package set could be obtained, 1 otherwise.
 *         NOTE(review): failures counted in `err' (missing pattern, pattern
 *         compilation error) still yield 1, as in the original code; confirm
 *         against callers before changing that.
 */
static int search(struct cmdctx *cmdctx)
{
    struct poclidek_ctx   *cctx = NULL;
    tn_array               *pkgs = NULL;
    tn_array               *matched_pkgs = NULL;
    int                    i, err = 0, display_bar = 0, bar_v;
    int                    term_height;
    int                    rc = 1;      /* value returned through l_end */
    struct pattern         *pt;
    unsigned               flags;
    
    if ((pt = cmdctx->_data) == NULL) {
        logn(LOGERR, _("search: no pattern given"));
        err++;
        goto l_end;
    }
    cmdctx->_data = NULL;            /* we'll free pattern myself */

    cctx = cmdctx->cctx;
    
    /* Fall back to the default search fields when, OPT_NO_SEARCHSW aside,
     * no flag was given */
    flags = cmdctx->_flags;
    flags &= ~OPT_NO_SEARCHSW;
    if (flags == 0)
        cmdctx->_flags |= OPT_SEARCH_DEFAULT;
    
    init_pcre();
    if (!pattern_compile(pt, poldek_ts_get_arg_count(cmdctx->ts))) {
        err++;
        goto l_end;
    }

    poclidek_load_packages(cmdctx->cctx, POCLIDEK_LOAD_ALL);
    if (poldek_ts_get_arg_count(cmdctx->ts) == 0) {
        pkgs = poclidek_get_dent_packages(cctx, NULL);
        
    } else {
        pkgs = poclidek_resolve_packages(NULL, cctx, cmdctx->ts, 0);
    }
    
    /* Leave through l_end so that the pattern we took ownership of is
     * released; a plain return here would leak it */
    if (pkgs == NULL) {
        rc = 0;
        goto l_end;
    }
    
    matched_pkgs = n_array_new(32, NULL, NULL);

    /* Show a progress bar only for OPT_SEARCH_HDD searches over more
     * than 5 packages */
    if (n_array_size(pkgs) > 5 && (cmdctx->_flags & OPT_SEARCH_HDD)) {
        display_bar = 1;
        msg(0, _("Searching packages..."));
    }
    bar_v = 0;
    
    for (i=0; i < n_array_size(pkgs); i++) {
        struct pkg *pkg = n_array_nth(pkgs, i);
        
        if (pkg_match(pkg, pt, cmdctx->_flags)) 
            n_array_push(matched_pkgs, pkg);
        
        if (display_bar) {
            int v, j;
            
            /* Progress scaled to 40 steps; emit the ticks not yet shown */
            v = i * 40 / n_array_size(pkgs);
            for (j = bar_v; j < v; j++)
                msg(0, "_.");
            bar_v = v;
        }
        
        if (sigint_reached()) {
            msgn(0, _("_interrupted."));
            goto l_end;
        }
    }
    
    if (display_bar) 
        msgn(0, _("_done."));

    /* The count goes before the list when the result fits within the
     * terminal height, and after it otherwise */
    term_height = poldek_term_get_height();
    if (n_array_size(matched_pkgs) == 0) 
        cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!No package matches '%s'\n",
                        pt->regexp);
    
    else if (n_array_size(matched_pkgs) < term_height)
        cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!%d package(s) found:\n",
                        n_array_size(matched_pkgs));
    
        
    for (i=0; i<n_array_size(matched_pkgs); i++) {
        struct pkg *pkg;

        pkg = n_array_nth(matched_pkgs, i);
        cmdctx_addtoresult(cmdctx, pkg);
        cmdctx_printf(cmdctx, "%s\n", pkg_id(pkg));
    }

    if (n_array_size(matched_pkgs) >= term_height)
        cmdctx_printf_c(cmdctx, PRCOLOR_YELLOW, "!%d package(s) found.\n",
                        n_array_size(matched_pkgs));
        
l_end:

    if (pkgs)
        n_array_free(pkgs);
    
    if (matched_pkgs)
        n_array_free(matched_pkgs);
    
    /* The pattern is freed below: make sure cmdctx no longer refers to it */
    cmdctx->_data = NULL;
    
    if (pt)
        pattern_free(pt);
    return rc;
}