コード例 #1
0
void search_buf(const char *buf, const int buf_len,
                const char *dir_full_path) {
    int binary = -1;  /* 1 = yes, 0 = no, -1 = don't know */
    int buf_offset = 0;

    if (opts.search_stream) {
        binary = 0;
    } else if (!opts.search_binary_files) {
        binary = is_binary((void*) buf, buf_len);
        if (binary) {
            log_debug("File %s is binary. Skipping...", dir_full_path);
            return;
        }
    }

    int matches_len = 0;
    match *matches;
    size_t matches_size;
    size_t matches_spare;

    if (opts.invert_match) {
        /* If we are going to invert the set of matches at the end, we will need
         * one extra match struct, even if there are no matches at all. So make
         * sure we have a nonempty array; and make sure we always have spare
         * capacity for one extra.
         */
        matches_size = 100;
        matches = ag_malloc(matches_size * sizeof(match));
        matches_spare = 1;
    } else {
        matches_size = 0;
        matches = NULL;
        matches_spare = 0;
    }

    if (opts.query_len == 1 && opts.query[0] == '.') {
        matches_size = 1;
        matches = ag_malloc(matches_size * sizeof(match));
        matches[0].start = 0;
        matches[0].end = buf_len;
        matches_len = 1;
    } else if (opts.literal) {
        const char *match_ptr = buf;
        strncmp_fp ag_strnstr_fp = get_strstr(opts);

        while (buf_offset < buf_len) {
            match_ptr = ag_strnstr_fp(match_ptr, opts.query, buf_len - buf_offset, opts.query_len, skip_lookup);
            if (match_ptr == NULL) {
                break;
            }

            if (opts.word_regexp) {
                const char *start = match_ptr;
                const char *end = match_ptr + opts.query_len;

                /* Check whether both start and end of the match lie on a word
                 * boundary
                 */
                if ((start == buf ||
                     is_wordchar(*(start - 1)) != opts.literal_starts_wordchar)
                    &&
                    (end == buf + buf_len ||
                     is_wordchar(*end) != opts.literal_ends_wordchar))
                {
                    /* It's a match */
                } else {
                    /* It's not a match */
                    match_ptr += opts.query_len;
                    buf_offset = end - buf;
                    continue;
                }
            }

            if ((size_t)matches_len + matches_spare >= matches_size) {
                matches_size = matches ? matches_size * 2 : 100;
                log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size);
                matches = ag_realloc(matches, matches_size * sizeof(match));
            }

            matches[matches_len].start = match_ptr - buf;
            matches[matches_len].end = matches[matches_len].start + opts.query_len;
            buf_offset = matches[matches_len].end;
            log_debug("Match found. File %s, offset %i bytes.", dir_full_path, matches[matches_len].start);
            matches_len++;
            match_ptr += opts.query_len;

            if (matches_len >= opts.max_matches_per_file) {
                log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path);
                break;
            }
        }
    } else {
        int rc;
        int offset_vector[3];
        while (buf_offset < buf_len &&
              (rc = pcre_exec(opts.re, opts.re_extra, buf, buf_len, buf_offset, 0, offset_vector, 3)) >= 0) {
            log_debug("Regex match found. File %s, offset %i bytes.", dir_full_path, offset_vector[0]);
            buf_offset = offset_vector[1];

            /* TODO: copy-pasted from above. FIXME */
            if ((size_t)matches_len + matches_spare >= matches_size) {
                matches_size = matches ? matches_size * 2 : 100;
                log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size);
                matches = ag_realloc(matches, matches_size * sizeof(match));
            }

            matches[matches_len].start = offset_vector[0];
            matches[matches_len].end = offset_vector[1];
            matches_len++;

            if (matches_len >= opts.max_matches_per_file) {
                log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path);
                break;
            }
        }
    }

    if (opts.invert_match) {
        matches_len = invert_matches(matches, matches_len, buf_len);
    }

    if (opts.stats) {
        pthread_mutex_lock(&stats_mtx);
        stats.total_bytes += buf_len;
        stats.total_files++;
        stats.total_matches += matches_len;
        pthread_mutex_unlock(&stats_mtx);
    }

    if (matches_len > 0) {
        if (binary == -1 && !opts.print_filename_only) {
            binary = is_binary((void*) buf, buf_len);
        }
        pthread_mutex_lock(&print_mtx);
        if (opts.print_filename_only) {
            print_path(dir_full_path, '\n');
        } else if (binary) {
            print_binary_file_matches(dir_full_path);
        } else {
            print_file_matches(dir_full_path, buf, buf_len, matches, matches_len);
        }
        pthread_mutex_unlock(&print_mtx);
    } else {
        log_debug("No match in %s", dir_full_path);
    }

    if (matches_size > 0) {
        free(matches);
    }
}
コード例 #2
0
ファイル: main.c プロジェクト: TidyHuang/filescan
int wmain(int argc, wchar_t **argv) {
	wchar_t **base_paths = NULL;
	wchar_t **paths = NULL;
    int i;
    int pcre_opts = PCRE_MULTILINE;
    int study_opts = 0;
    double time_diff;
    worker_t *workers = NULL;
    int workers_len;
    int num_cores;

#ifdef KJK_BUILD
    extern void setup_crash_handler(); /* in kjk_crash_handler.cpp */
    setup_crash_handler();
#endif

    set_log_level(LOG_LEVEL_WARN);

    work_queue = NULL;
    work_queue_tail = NULL;
    memset(&stats, 0, sizeof(stats));
    root_ignores = init_ignore(NULL, L"", 0);
    out_fd = stdout;
#ifdef USE_PCRE_JIT
    int has_jit = 0;
    pcre_config(PCRE_CONFIG_JIT, &has_jit);
    if (has_jit) {
        study_opts |= PCRE_STUDY_JIT_COMPILE;
    }
#endif

    gettimeofday(&(stats.time_start), NULL);

    parse_options(argc, argv, &base_paths, &paths);
	log_debug(L"PCRE Version: %s", pcre16_version());
	setlocale(LC_ALL, "chs");

#ifdef _WIN32
    {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        num_cores = si.dwNumberOfProcessors;
    }
#else
    num_cores = (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif

    workers_len = num_cores;
    if (opts.literal) {
        workers_len--;
    }
    if (opts.workers) {
        workers_len = opts.workers;
    }
    if (workers_len < 1) {
        workers_len = 1;
    }

    log_debug(L"Using %i workers", workers_len);
    done_adding_files = FALSE;
    workers = (worker_t *) ag_calloc(workers_len, sizeof(worker_t));
	if (pthread_cond_init(&files_ready, NULL)) {
        die(L"pthread_cond_init failed!");
    }
    if (pthread_mutex_init(&print_mtx, NULL)) {
        die(L"pthread_mutex_init failed!");
    }
    if (pthread_mutex_init(&stats_mtx, NULL)) {
        die(L"pthread_mutex_init failed!");
    }
    if (pthread_mutex_init(&work_queue_mtx, NULL)) {
        die(L"pthread_mutex_init failed!");
    }

    if (opts.casing == CASE_SMART) {
        opts.casing = is_lowercase(opts.query) ? CASE_INSENSITIVE : CASE_SENSITIVE;
    }

    if (opts.literal) {
        if (opts.casing == CASE_INSENSITIVE) {
            /* Search routine needs the query to be lowercase */
            wchar_t *c = opts.query;
            for (; *c != '\0'; ++c) {
				*c = (wchar_t)towlower(*c);
            }
        }
        generate_alpha_skip(opts.query, opts.query_len, alpha_skip_lookup, opts.casing == CASE_SENSITIVE);
        find_skip_lookup = NULL;
        generate_find_skip(opts.query, opts.query_len, &find_skip_lookup, opts.casing == CASE_SENSITIVE);
        if (opts.word_regexp) {
            init_wordchar_table();
            opts.literal_starts_wordchar = is_wordchar(opts.query[0]);
            opts.literal_ends_wordchar = is_wordchar(opts.query[opts.query_len - 1]);
        }
    } else {
        if (opts.casing == CASE_INSENSITIVE) {
            pcre_opts |= PCRE_CASELESS;
        }
        if (opts.word_regexp) {
            wchar_t *word_regexp_query;
            ag_asprintf(&word_regexp_query, L"\\b%s\\b", opts.query);
            free(opts.query);
            opts.query = word_regexp_query;
            opts.query_len = wcslen(opts.query);
        }
        compile_study(&opts.re, &opts.re_extra, opts.query, pcre_opts, study_opts);
    }

    if (opts.search_stream) {
        search_stream(stdin, L"");
    } else {
        for (i = 0; i < workers_len; i++) {
            workers[i].id = i;
            int rv = pthread_create(&(workers[i].thread), NULL, &search_file_worker, &(workers[i].id));
            if (rv != 0) {
                die(L"error in pthread_create(): %s", strerror(rv));
            }
#if defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(USE_CPU_SET)
            if (opts.use_thread_affinity) {
                cpu_set_t cpu_set;
                CPU_ZERO(&cpu_set);
                CPU_SET(i % num_cores, &cpu_set);
                rv = pthread_setaffinity_np(workers[i].thread, sizeof(cpu_set), &cpu_set);
                if (rv != 0) {
                    die("error in pthread_setaffinity_np(): %s", strerror(rv));
                }
                log_debug("Thread %i set to CPU %i", i, i);
            } else {
                log_debug("Thread affinity disabled.");
            }
#else
            log_debug(L"No CPU affinity support.");
#endif
        }
        for (i = 0; paths[i] != NULL; i++) {
            log_debug(L"searching path %s for %s", paths[i], opts.query);
            symhash = NULL;
            ignores *ig = init_ignore(root_ignores, L"", 0);
            struct stat s;
            s.st_dev = 0;
#ifndef _WIN32
            /* The device is ignored if opts.one_dev is false, so it's fine
             * to leave it at the default 0
             */
            if (opts.one_dev && lstat(paths[i], &s) == -1) {
                log_err("Failed to get device information for path %s. Skipping...", paths[i]);
            }
#endif
            search_dir(ig, base_paths[i], paths[i], 0, s.st_dev);
            cleanup_ignore(ig);
        }
        pthread_mutex_lock(&work_queue_mtx);
        done_adding_files = TRUE;
        pthread_cond_broadcast(&files_ready);
        pthread_mutex_unlock(&work_queue_mtx);
        for (i = 0; i < workers_len; i++) {
            if (pthread_join(workers[i].thread, NULL)) {
                die(L"pthread_join failed!");
            }
        }
    }

    if (opts.stats) {
        gettimeofday(&(stats.time_end), NULL);
        time_diff = ((long)stats.time_end.tv_sec * 1000000 + stats.time_end.tv_usec) -
                    ((long)stats.time_start.tv_sec * 1000000 + stats.time_start.tv_usec);
        time_diff /= 1000000;

        wprintf(L"%ld matches\n%ld files searched\n%ld bytes searched\n%f seconds\n", stats.total_matches, stats.total_files, stats.total_bytes, time_diff);
    }

    if (opts.pager) {
        pclose(out_fd);
    }
    cleanup_options();
    pthread_cond_destroy(&files_ready);
    pthread_mutex_destroy(&work_queue_mtx);
    pthread_mutex_destroy(&stats_mtx);
    pthread_mutex_destroy(&print_mtx);
    cleanup_ignore(root_ignores);
    free(workers);
    for (i = 0; paths[i] != NULL; i++) {
        free(paths[i]);
        free(base_paths[i]);
    }
    free(base_paths);
    free(paths);
    if (find_skip_lookup) {
        free(find_skip_lookup);
    }
    return !opts.match_found;
}
コード例 #3
0
int main(int argc, char **argv) {
    char **base_paths = NULL;
    char **paths = NULL;
    int i;
    int pcre_opts = PCRE_MULTILINE;
    int study_opts = 0;
    double time_diff;
    pthread_t *workers = NULL;
    int workers_len;

    set_log_level(LOG_LEVEL_WARN);

    work_queue = NULL;
    work_queue_tail = NULL;
    memset(&stats, 0, sizeof(stats));
    root_ignores = init_ignore(NULL, "", 0);
    out_fd = stdout;
#ifdef USE_PCRE_JIT
    int has_jit = 0;
    pcre_config(PCRE_CONFIG_JIT, &has_jit);
    if (has_jit) {
        study_opts |= PCRE_STUDY_JIT_COMPILE;
    }
#endif

    gettimeofday(&(stats.time_start), NULL);

    parse_options(argc, argv, &base_paths, &paths);
    log_debug("PCRE Version: %s", pcre_version());

#ifdef _WIN32
    {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        workers_len = si.dwNumberOfProcessors;
    }
#else
    workers_len = (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif
    if (opts.literal) {
        workers_len--;
    }
    if (opts.workers) {
        workers_len = opts.workers;
    }
    if (workers_len < 1) {
        workers_len = 1;
    }

    log_debug("Using %i workers", workers_len);
    done_adding_files = FALSE;
    workers = ag_calloc(workers_len, sizeof(pthread_t));
    if (pthread_cond_init(&files_ready, NULL)) {
        die("pthread_cond_init failed!");
    }
    if (pthread_mutex_init(&print_mtx, NULL)) {
        die("pthread_mutex_init failed!");
    }
    if (pthread_mutex_init(&stats_mtx, NULL)) {
        die("pthread_mutex_init failed!");
    }
    if (pthread_mutex_init(&work_queue_mtx, NULL)) {
        die("pthread_mutex_init failed!");
    }

    if (opts.casing == CASE_SMART) {
        opts.casing = is_lowercase(opts.query) ? CASE_INSENSITIVE : CASE_SENSITIVE;
    }

    if (opts.literal) {
        if (opts.casing == CASE_INSENSITIVE) {
            /* Search routine needs the query to be lowercase */
            char *c = opts.query;
            for (; *c != '\0'; ++c) {
                *c = (char)tolower(*c);
            }
        }
        generate_alpha_skip(opts.query, opts.query_len, alpha_skip_lookup, opts.casing == CASE_SENSITIVE);
        find_skip_lookup = NULL;
        generate_find_skip(opts.query, opts.query_len, &find_skip_lookup, opts.casing == CASE_SENSITIVE);
        if (opts.word_regexp) {
            init_wordchar_table();
            opts.literal_starts_wordchar = is_wordchar(opts.query[0]);
            opts.literal_ends_wordchar = is_wordchar(opts.query[opts.query_len - 1]);
        }
    } else {
        if (opts.casing == CASE_INSENSITIVE) {
            pcre_opts |= PCRE_CASELESS;
        }
        if (opts.word_regexp) {
            char *word_regexp_query;
            ag_asprintf(&word_regexp_query, "\\b%s\\b", opts.query);
            free(opts.query);
            opts.query = word_regexp_query;
            opts.query_len = strlen(opts.query);
        }
        compile_study(&opts.re, &opts.re_extra, opts.query, pcre_opts, study_opts);
    }

    if (opts.search_stream) {
        search_stream(stdin, "");
    } else {
        for (i = 0; i < workers_len; i++) {
            int rv = pthread_create(&(workers[i]), NULL, &search_file_worker, &i);
            if (rv != 0) {
                die("error in pthread_create(): %s", strerror(rv));
            }
        }
        for (i = 0; paths[i] != NULL; i++) {
            log_debug("searching path %s for %s", paths[i], opts.query);
            symhash = NULL;
            ignores *ig = init_ignore(root_ignores, "", 0);
            search_dir(ig, base_paths[i], paths[i], 0);
            cleanup_ignore(ig);
        }
        pthread_mutex_lock(&work_queue_mtx);
        done_adding_files = TRUE;
        pthread_cond_broadcast(&files_ready);
        pthread_mutex_unlock(&work_queue_mtx);
        for (i = 0; i < workers_len; i++) {
            if (pthread_join(workers[i], NULL)) {
                die("pthread_join failed!");
            }
        }
    }

    if (opts.stats) {
        gettimeofday(&(stats.time_end), NULL);
        time_diff = ((long)stats.time_end.tv_sec * 1000000 + stats.time_end.tv_usec) -
                    ((long)stats.time_start.tv_sec * 1000000 + stats.time_start.tv_usec);
        time_diff /= 1000000;

        printf("%ld matches\n%ld files searched\n%ld bytes searched\n%f seconds\n", stats.total_matches, stats.total_files, stats.total_bytes, time_diff);
    }

    if (opts.pager) {
        pclose(out_fd);
    }
    cleanup_options();
    pthread_cond_destroy(&files_ready);
    pthread_mutex_destroy(&work_queue_mtx);
    pthread_mutex_destroy(&stats_mtx);
    pthread_mutex_destroy(&print_mtx);
    cleanup_ignore(root_ignores);
    free(workers);
    for (i = 0; paths[i] != NULL; i++) {
        free(paths[i]);
        free(base_paths[i]);
    }
    free(base_paths);
    free(paths);
    if (find_skip_lookup) {
        free(find_skip_lookup);
    }
    return !opts.match_found;
}
コード例 #4
0
ファイル: searcher.c プロジェクト: TidyHuang/DLP
void search_buf(char *buf, size_t buf_len,
	const char *dir_full_path, char *tmp_file_path) {
	int binary = -1; /* 1 = yes, 0 = no, -1 = don't know */
	size_t buf_offset = 0;

	//if (opts.search_stream) {
	//	binary = 0;
	//}
	//else if (!opts.search_binary_files) {
	//	binary = is_binary(buf, buf_len);

	//	if (binary) {
	//		log_debug("File %s is binary. Skipping...", dir_full_path);
	//		if (!convert_to_text(&buf, &buf_len, dir_full_path, tmp_file_path))
	//			return;
	//	}
	//}

	binary = is_binary(buf, buf_len);

	if (binary) {
		log_debug("File %s is binary. Convert to text...", dir_full_path);
		if (!convert_to_text(&buf, &buf_len, dir_full_path, tmp_file_path))
			return;
	}
	size_t matches_len = 0;
	match_t *matches;
	size_t matches_size;
	size_t matches_spare;

	//if (opts.invert_match) {
	//	/* If we are going to invert the set of matches at the end, we will need
	//	* one extra match struct, even if there are no matches at all. So make
	//	* sure we have a nonempty array; and make sure we always have spare
	//	* capacity for one extra.
	//	*/
	//	matches_size = 100;
	//	matches = (match_t *)ag_malloc(matches_size * sizeof(match_t));
	//	matches_spare = 1;
	//}
	//else {
	//	matches_size = 0;
	//	matches = NULL;
	//	matches_spare = 0;
	//}
	matches_size = 0;
	matches = NULL;
	matches_spare = 0;

	if (/*!opts.literal && */opts.query_len == 1 && opts.query[0] == '.') {
		matches_size = 1;
		matches = (match_t *)ag_malloc(matches_size * sizeof(match_t));
		matches[0].start = 0;
		matches[0].end = buf_len;
		matches_len = 1;
	}
	else {
		const char *match_ptr = buf;
		strncmp_fp ag_strnstr_fp = get_strstr(opts.casing);

		while (buf_offset < buf_len) {
			match_ptr = ag_strnstr_fp(match_ptr, opts.query, buf_len - buf_offset, opts.query_len, alpha_skip_lookup, find_skip_lookup);
			if (match_ptr == NULL) {
				break;
			}

			if (opts.word_regexp) {
				const char *start = match_ptr;
				const char *end = match_ptr + opts.query_len;

				/* Check whether both start and end of the match lie on a word
				* boundary
				*/
				if ((start == buf ||
					is_wordchar(*(start - 1)) != opts.literal_starts_wordchar) &&
					(end == buf + buf_len ||
					is_wordchar(*end) != opts.literal_ends_wordchar)) {
					/* It's a match */
				}
				else {
					/* It's not a match */
					match_ptr += opts.query_len;
					buf_offset = end - buf;
					continue;
				}
			}

			if (matches_len + matches_spare >= matches_size) {
				/* TODO: benchmark initial size of matches. 100 may be too small/big */
				matches_size = matches ? matches_size * 2 : 100;
				log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size);
				matches = (match_t *)ag_realloc(matches, matches_size * sizeof(match_t));
			}

			matches[matches_len].start = match_ptr - buf;
			matches[matches_len].end = matches[matches_len].start + opts.query_len;
			buf_offset = matches[matches_len].end;
			log_debug("Match found. File %s, offset %lu bytes.", dir_full_path, matches[matches_len].start);
			matches_len++;
			match_ptr += opts.query_len;

			if (opts.max_matches_per_file > 0 && matches_len >= opts.max_matches_per_file) {
				log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path);
				break;
			}
		}
	}
	else {