void search_buf(const char *buf, const int buf_len, const char *dir_full_path) { int binary = -1; /* 1 = yes, 0 = no, -1 = don't know */ int buf_offset = 0; if (opts.search_stream) { binary = 0; } else if (!opts.search_binary_files) { binary = is_binary((void*) buf, buf_len); if (binary) { log_debug("File %s is binary. Skipping...", dir_full_path); return; } } int matches_len = 0; match *matches; size_t matches_size; size_t matches_spare; if (opts.invert_match) { /* If we are going to invert the set of matches at the end, we will need * one extra match struct, even if there are no matches at all. So make * sure we have a nonempty array; and make sure we always have spare * capacity for one extra. */ matches_size = 100; matches = ag_malloc(matches_size * sizeof(match)); matches_spare = 1; } else { matches_size = 0; matches = NULL; matches_spare = 0; } if (opts.query_len == 1 && opts.query[0] == '.') { matches_size = 1; matches = ag_malloc(matches_size * sizeof(match)); matches[0].start = 0; matches[0].end = buf_len; matches_len = 1; } else if (opts.literal) { const char *match_ptr = buf; strncmp_fp ag_strnstr_fp = get_strstr(opts); while (buf_offset < buf_len) { match_ptr = ag_strnstr_fp(match_ptr, opts.query, buf_len - buf_offset, opts.query_len, skip_lookup); if (match_ptr == NULL) { break; } if (opts.word_regexp) { const char *start = match_ptr; const char *end = match_ptr + opts.query_len; /* Check whether both start and end of the match lie on a word * boundary */ if ((start == buf || is_wordchar(*(start - 1)) != opts.literal_starts_wordchar) && (end == buf + buf_len || is_wordchar(*end) != opts.literal_ends_wordchar)) { /* It's a match */ } else { /* It's not a match */ match_ptr += opts.query_len; buf_offset = end - buf; continue; } } if ((size_t)matches_len + matches_spare >= matches_size) { matches_size = matches ? matches_size * 2 : 100; log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size); matches = ag_realloc(matches, matches_size * sizeof(match)); } matches[matches_len].start = match_ptr - buf; matches[matches_len].end = matches[matches_len].start + opts.query_len; buf_offset = matches[matches_len].end; log_debug("Match found. File %s, offset %i bytes.", dir_full_path, matches[matches_len].start); matches_len++; match_ptr += opts.query_len; if (matches_len >= opts.max_matches_per_file) { log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path); break; } } } else { int rc; int offset_vector[3]; while (buf_offset < buf_len && (rc = pcre_exec(opts.re, opts.re_extra, buf, buf_len, buf_offset, 0, offset_vector, 3)) >= 0) { log_debug("Regex match found. File %s, offset %i bytes.", dir_full_path, offset_vector[0]); buf_offset = offset_vector[1]; /* TODO: copy-pasted from above. FIXME */ if ((size_t)matches_len + matches_spare >= matches_size) { matches_size = matches ? matches_size * 2 : 100; log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size); matches = ag_realloc(matches, matches_size * sizeof(match)); } matches[matches_len].start = offset_vector[0]; matches[matches_len].end = offset_vector[1]; matches_len++; if (matches_len >= opts.max_matches_per_file) { log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path); break; } } } if (opts.invert_match) { matches_len = invert_matches(matches, matches_len, buf_len); } if (opts.stats) { pthread_mutex_lock(&stats_mtx); stats.total_bytes += buf_len; stats.total_files++; stats.total_matches += matches_len; pthread_mutex_unlock(&stats_mtx); } if (matches_len > 0) { if (binary == -1 && !opts.print_filename_only) { binary = is_binary((void*) buf, buf_len); } pthread_mutex_lock(&print_mtx); if (opts.print_filename_only) { print_path(dir_full_path, '\n'); } else if (binary) { print_binary_file_matches(dir_full_path); } else { print_file_matches(dir_full_path, buf, buf_len, matches, matches_len); } pthread_mutex_unlock(&print_mtx); } else { log_debug("No match in %s", dir_full_path); } if (matches_size > 0) { free(matches); } }
int wmain(int argc, wchar_t **argv) { wchar_t **base_paths = NULL; wchar_t **paths = NULL; int i; int pcre_opts = PCRE_MULTILINE; int study_opts = 0; double time_diff; worker_t *workers = NULL; int workers_len; int num_cores; #ifdef KJK_BUILD extern void setup_crash_handler(); /* in kjk_crash_handler.cpp */ setup_crash_handler(); #endif set_log_level(LOG_LEVEL_WARN); work_queue = NULL; work_queue_tail = NULL; memset(&stats, 0, sizeof(stats)); root_ignores = init_ignore(NULL, L"", 0); out_fd = stdout; #ifdef USE_PCRE_JIT int has_jit = 0; pcre_config(PCRE_CONFIG_JIT, &has_jit); if (has_jit) { study_opts |= PCRE_STUDY_JIT_COMPILE; } #endif gettimeofday(&(stats.time_start), NULL); parse_options(argc, argv, &base_paths, &paths); log_debug(L"PCRE Version: %s", pcre16_version()); setlocale(LC_ALL, "chs"); #ifdef _WIN32 { SYSTEM_INFO si; GetSystemInfo(&si); num_cores = si.dwNumberOfProcessors; } #else num_cores = (int)sysconf(_SC_NPROCESSORS_ONLN); #endif workers_len = num_cores; if (opts.literal) { workers_len--; } if (opts.workers) { workers_len = opts.workers; } if (workers_len < 1) { workers_len = 1; } log_debug(L"Using %i workers", workers_len); done_adding_files = FALSE; workers = (worker_t *) ag_calloc(workers_len, sizeof(worker_t)); if (pthread_cond_init(&files_ready, NULL)) { die(L"pthread_cond_init failed!"); } if (pthread_mutex_init(&print_mtx, NULL)) { die(L"pthread_mutex_init failed!"); } if (pthread_mutex_init(&stats_mtx, NULL)) { die(L"pthread_mutex_init failed!"); } if (pthread_mutex_init(&work_queue_mtx, NULL)) { die(L"pthread_mutex_init failed!"); } if (opts.casing == CASE_SMART) { opts.casing = is_lowercase(opts.query) ? CASE_INSENSITIVE : CASE_SENSITIVE; } if (opts.literal) { if (opts.casing == CASE_INSENSITIVE) { /* Search routine needs the query to be lowercase */ wchar_t *c = opts.query; for (; *c != '\0'; ++c) { *c = (wchar_t)towlower(*c); } } generate_alpha_skip(opts.query, opts.query_len, alpha_skip_lookup, opts.casing == CASE_SENSITIVE); find_skip_lookup = NULL; generate_find_skip(opts.query, opts.query_len, &find_skip_lookup, opts.casing == CASE_SENSITIVE); if (opts.word_regexp) { init_wordchar_table(); opts.literal_starts_wordchar = is_wordchar(opts.query[0]); opts.literal_ends_wordchar = is_wordchar(opts.query[opts.query_len - 1]); } } else { if (opts.casing == CASE_INSENSITIVE) { pcre_opts |= PCRE_CASELESS; } if (opts.word_regexp) { wchar_t *word_regexp_query; ag_asprintf(&word_regexp_query, L"\\b%s\\b", opts.query); free(opts.query); opts.query = word_regexp_query; opts.query_len = wcslen(opts.query); } compile_study(&opts.re, &opts.re_extra, opts.query, pcre_opts, study_opts); } if (opts.search_stream) { search_stream(stdin, L""); } else { for (i = 0; i < workers_len; i++) { workers[i].id = i; int rv = pthread_create(&(workers[i].thread), NULL, &search_file_worker, &(workers[i].id)); if (rv != 0) { die(L"error in pthread_create(): %s", strerror(rv)); } #if defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(USE_CPU_SET) if (opts.use_thread_affinity) { cpu_set_t cpu_set; CPU_ZERO(&cpu_set); CPU_SET(i % num_cores, &cpu_set); rv = pthread_setaffinity_np(workers[i].thread, sizeof(cpu_set), &cpu_set); if (rv != 0) { die("error in pthread_setaffinity_np(): %s", strerror(rv)); } log_debug("Thread %i set to CPU %i", i, i); } else { log_debug("Thread affinity disabled."); } #else log_debug(L"No CPU affinity support."); #endif } for (i = 0; paths[i] != NULL; i++) { log_debug(L"searching path %s for %s", paths[i], opts.query); symhash = NULL; ignores *ig = init_ignore(root_ignores, L"", 0); struct stat s; s.st_dev = 0; #ifndef _WIN32 /* The device is ignored if opts.one_dev is false, so it's fine * to leave it at the default 0 */ if (opts.one_dev && lstat(paths[i], &s) == -1) { log_err("Failed to get device information for path %s. Skipping...", paths[i]); } #endif search_dir(ig, base_paths[i], paths[i], 0, s.st_dev); cleanup_ignore(ig); } pthread_mutex_lock(&work_queue_mtx); done_adding_files = TRUE; pthread_cond_broadcast(&files_ready); pthread_mutex_unlock(&work_queue_mtx); for (i = 0; i < workers_len; i++) { if (pthread_join(workers[i].thread, NULL)) { die(L"pthread_join failed!"); } } } if (opts.stats) { gettimeofday(&(stats.time_end), NULL); time_diff = ((long)stats.time_end.tv_sec * 1000000 + stats.time_end.tv_usec) - ((long)stats.time_start.tv_sec * 1000000 + stats.time_start.tv_usec); time_diff /= 1000000; wprintf(L"%ld matches\n%ld files searched\n%ld bytes searched\n%f seconds\n", stats.total_matches, stats.total_files, stats.total_bytes, time_diff); } if (opts.pager) { pclose(out_fd); } cleanup_options(); pthread_cond_destroy(&files_ready); pthread_mutex_destroy(&work_queue_mtx); pthread_mutex_destroy(&stats_mtx); pthread_mutex_destroy(&print_mtx); cleanup_ignore(root_ignores); free(workers); for (i = 0; paths[i] != NULL; i++) { free(paths[i]); free(base_paths[i]); } free(base_paths); free(paths); if (find_skip_lookup) { free(find_skip_lookup); } return !opts.match_found; }
int main(int argc, char **argv) { char **base_paths = NULL; char **paths = NULL; int i; int pcre_opts = PCRE_MULTILINE; int study_opts = 0; double time_diff; pthread_t *workers = NULL; int workers_len; set_log_level(LOG_LEVEL_WARN); work_queue = NULL; work_queue_tail = NULL; memset(&stats, 0, sizeof(stats)); root_ignores = init_ignore(NULL, "", 0); out_fd = stdout; #ifdef USE_PCRE_JIT int has_jit = 0; pcre_config(PCRE_CONFIG_JIT, &has_jit); if (has_jit) { study_opts |= PCRE_STUDY_JIT_COMPILE; } #endif gettimeofday(&(stats.time_start), NULL); parse_options(argc, argv, &base_paths, &paths); log_debug("PCRE Version: %s", pcre_version()); #ifdef _WIN32 { SYSTEM_INFO si; GetSystemInfo(&si); workers_len = si.dwNumberOfProcessors; } #else workers_len = (int)sysconf(_SC_NPROCESSORS_ONLN); #endif if (opts.literal) { workers_len--; } if (opts.workers) { workers_len = opts.workers; } if (workers_len < 1) { workers_len = 1; } log_debug("Using %i workers", workers_len); done_adding_files = FALSE; workers = ag_calloc(workers_len, sizeof(pthread_t)); if (pthread_cond_init(&files_ready, NULL)) { die("pthread_cond_init failed!"); } if (pthread_mutex_init(&print_mtx, NULL)) { die("pthread_mutex_init failed!"); } if (pthread_mutex_init(&stats_mtx, NULL)) { die("pthread_mutex_init failed!"); } if (pthread_mutex_init(&work_queue_mtx, NULL)) { die("pthread_mutex_init failed!"); } if (opts.casing == CASE_SMART) { opts.casing = is_lowercase(opts.query) ? CASE_INSENSITIVE : CASE_SENSITIVE; } if (opts.literal) { if (opts.casing == CASE_INSENSITIVE) { /* Search routine needs the query to be lowercase */ char *c = opts.query; for (; *c != '\0'; ++c) { *c = (char)tolower(*c); } } generate_alpha_skip(opts.query, opts.query_len, alpha_skip_lookup, opts.casing == CASE_SENSITIVE); find_skip_lookup = NULL; generate_find_skip(opts.query, opts.query_len, &find_skip_lookup, opts.casing == CASE_SENSITIVE); if (opts.word_regexp) { init_wordchar_table(); opts.literal_starts_wordchar = is_wordchar(opts.query[0]); opts.literal_ends_wordchar = is_wordchar(opts.query[opts.query_len - 1]); } } else { if (opts.casing == CASE_INSENSITIVE) { pcre_opts |= PCRE_CASELESS; } if (opts.word_regexp) { char *word_regexp_query; ag_asprintf(&word_regexp_query, "\\b%s\\b", opts.query); free(opts.query); opts.query = word_regexp_query; opts.query_len = strlen(opts.query); } compile_study(&opts.re, &opts.re_extra, opts.query, pcre_opts, study_opts); } if (opts.search_stream) { search_stream(stdin, ""); } else { for (i = 0; i < workers_len; i++) { int rv = pthread_create(&(workers[i]), NULL, &search_file_worker, &i); if (rv != 0) { die("error in pthread_create(): %s", strerror(rv)); } } for (i = 0; paths[i] != NULL; i++) { log_debug("searching path %s for %s", paths[i], opts.query); symhash = NULL; ignores *ig = init_ignore(root_ignores, "", 0); search_dir(ig, base_paths[i], paths[i], 0); cleanup_ignore(ig); } pthread_mutex_lock(&work_queue_mtx); done_adding_files = TRUE; pthread_cond_broadcast(&files_ready); pthread_mutex_unlock(&work_queue_mtx); for (i = 0; i < workers_len; i++) { if (pthread_join(workers[i], NULL)) { die("pthread_join failed!"); } } } if (opts.stats) { gettimeofday(&(stats.time_end), NULL); time_diff = ((long)stats.time_end.tv_sec * 1000000 + stats.time_end.tv_usec) - ((long)stats.time_start.tv_sec * 1000000 + stats.time_start.tv_usec); time_diff /= 1000000; printf("%ld matches\n%ld files searched\n%ld bytes searched\n%f seconds\n", stats.total_matches, stats.total_files, stats.total_bytes, time_diff); } if (opts.pager) { pclose(out_fd); } cleanup_options(); pthread_cond_destroy(&files_ready); pthread_mutex_destroy(&work_queue_mtx); pthread_mutex_destroy(&stats_mtx); pthread_mutex_destroy(&print_mtx); cleanup_ignore(root_ignores); free(workers); for (i = 0; paths[i] != NULL; i++) { free(paths[i]); free(base_paths[i]); } free(base_paths); free(paths); if (find_skip_lookup) { free(find_skip_lookup); } return !opts.match_found; }
void search_buf(char *buf, size_t buf_len, const char *dir_full_path, char *tmp_file_path) { int binary = -1; /* 1 = yes, 0 = no, -1 = don't know */ size_t buf_offset = 0; //if (opts.search_stream) { // binary = 0; //} //else if (!opts.search_binary_files) { // binary = is_binary(buf, buf_len); // if (binary) { // log_debug("File %s is binary. Skipping...", dir_full_path); // if (!convert_to_text(&buf, &buf_len, dir_full_path, tmp_file_path)) // return; // } //} binary = is_binary(buf, buf_len); if (binary) { log_debug("File %s is binary. Convert to text...", dir_full_path); if (!convert_to_text(&buf, &buf_len, dir_full_path, tmp_file_path)) return; } size_t matches_len = 0; match_t *matches; size_t matches_size; size_t matches_spare; //if (opts.invert_match) { // /* If we are going to invert the set of matches at the end, we will need // * one extra match struct, even if there are no matches at all. So make // * sure we have a nonempty array; and make sure we always have spare // * capacity for one extra. // */ // matches_size = 100; // matches = (match_t *)ag_malloc(matches_size * sizeof(match_t)); // matches_spare = 1; //} //else { // matches_size = 0; // matches = NULL; // matches_spare = 0; //} matches_size = 0; matches = NULL; matches_spare = 0; if (/*!opts.literal && */opts.query_len == 1 && opts.query[0] == '.') { matches_size = 1; matches = (match_t *)ag_malloc(matches_size * sizeof(match_t)); matches[0].start = 0; matches[0].end = buf_len; matches_len = 1; } else { const char *match_ptr = buf; strncmp_fp ag_strnstr_fp = get_strstr(opts.casing); while (buf_offset < buf_len) { match_ptr = ag_strnstr_fp(match_ptr, opts.query, buf_len - buf_offset, opts.query_len, alpha_skip_lookup, find_skip_lookup); if (match_ptr == NULL) { break; } if (opts.word_regexp) { const char *start = match_ptr; const char *end = match_ptr + opts.query_len; /* Check whether both start and end of the match lie on a word * boundary */ if ((start == buf || is_wordchar(*(start - 1)) != opts.literal_starts_wordchar) && (end == buf + buf_len || is_wordchar(*end) != opts.literal_ends_wordchar)) { /* It's a match */ } else { /* It's not a match */ match_ptr += opts.query_len; buf_offset = end - buf; continue; } } if (matches_len + matches_spare >= matches_size) { /* TODO: benchmark initial size of matches. 100 may be too small/big */ matches_size = matches ? matches_size * 2 : 100; log_debug("Too many matches in %s. Reallocating matches to %zu.", dir_full_path, matches_size); matches = (match_t *)ag_realloc(matches, matches_size * sizeof(match_t)); } matches[matches_len].start = match_ptr - buf; matches[matches_len].end = matches[matches_len].start + opts.query_len; buf_offset = matches[matches_len].end; log_debug("Match found. File %s, offset %lu bytes.", dir_full_path, matches[matches_len].start); matches_len++; match_ptr += opts.query_len; if (opts.max_matches_per_file > 0 && matches_len >= opts.max_matches_per_file) { log_err("Too many matches in %s. Skipping the rest of this file.", dir_full_path); break; } } } else {