RE2Regex(const char* string, unsigned int options): casefold(false) { RE2::Options opts; opts.set_posix_syntax(true); opts.set_perl_classes(true); opts.set_word_boundary(true); opts.set_one_line(false); opts.set_never_nl(true); opts.set_literal((options & RO_LITERAL) != 0); opts.set_log_errors(false); std::string pattern; if ((options & RO_IGNORECASE) && transformRegexCasefold(string, pattern, (options & RO_LITERAL) != 0)) { casefold = true; } else { pattern = string; opts.set_case_sensitive((options & RO_IGNORECASE) == 0); } re.reset(new RE2(pattern, opts)); if (!re->ok()) throw std::runtime_error("Error parsing regular expression " + (string + (": " + re->error()))); std::string prefix = getPrefix(re.get(), 128); if (prefix.length() == 1) matcher.reset(new LiteralMatcherFirst(prefix.c_str())); #ifdef USE_SSE2 else if (prefix.length() > 1) matcher.reset(new LiteralMatcherSSE(prefix.c_str())); #endif }
static void cpy_options(RE2::Options &options, JNIEnv *env, jobject j_options) { assert(j_options != 0); jclass j_options_cls = env->GetObjectClass(j_options); options.set_encoding(get_re2_encoding(env, env->GetObjectField(j_options, get_field_id_safe(env, j_options_cls, "encoding", "Lcom/logentries/re2/Encoding;")))); options.set_posix_syntax(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "posixSyntax", "Z"))); options.set_longest_match(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "longestMatch", "Z"))); options.set_log_errors(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "logErrors", "Z"))); options.set_max_mem(safe_cast<uint64_t>(env->GetLongField(j_options, get_field_id_safe(env, j_options_cls, "maxMem", "J")))); options.set_literal(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "literal", "Z"))); options.set_never_nl(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverNl", "Z"))); options.set_never_capture(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverCapture", "Z"))); options.set_case_sensitive(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "caseSensitive", "Z"))); options.set_perl_classes(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "perlClasses", "Z"))); options.set_word_boundary(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "wordBoundary", "Z"))); }
/* * @bbeveridge * ExecuteSearch has gotten very hairy, and should be #refactored. * The intended logic goes as such * - If it is valid to search using trigram indexing, then: * 1) Search using trigrams. To avoid rehitting stalefiles, add * every searched file to the staleFilesThatHaveBeenSearched set * 2) Skip reading the archive entirely * 3) Do the same handling for stalefiles & wait for regex streams * * - Search the archive by: * 1) iterating all files in the archive, if a file is deleted, skip * it. If a file is 'stale' then skip it in the archive & search * the actual file on disk * * - Directly search any files that have been added to disk & are * therefore not indexed. * - Wait for the RE2 threads to complete */ void ExecuteSearch(GrepParams* param) { struct archive_entry *entry; int r; StringSet staleFilesThatHaveBeenSearched; LoadStaleSets(param->sourceArchiveName); ConsumerThreadContext* context = new ConsumerThreadContext(); Stream* dataStream = CreateStream(param->streamBlockSize, param->streamBlockCount); context->dataStream = dataStream; context->callbackFunction = param->callbackFunction; context->callbackContext = param->callbackContext; RE2::Options options; options.set_case_sensitive(param->caseSensitive); options.set_literal(param->regexIsLiteral); context->pattern = new RE2(param->searchPattern, options); if (context->pattern->ok() == false) { printf("FATAL: Primary regex expression has an error : %s\n", context->pattern->error().c_str()); exit(1); } if (param->secondPhasePattern) { RE2::Options nocase; nocase.set_case_sensitive(false); context->secondPhasePattern = new RE2(param->secondPhasePattern, nocase); if (context->secondPhasePattern->ok() == false) { printf("FATAL: Secondary regex expression has an error : %s\n", context->secondPhasePattern->error().c_str()); exit(1); } } thread* consumer = launch(grepThreadFn, context); struct QArchive cacheQArchive; cacheQArchive.a = NULL; std::string baseDirectory = GetBaseFromFilename(param->sourceArchiveName).c_str(); if (!param->searchFilenames && !param->ignoreTrigrams) { // trigram search TrigramContext tri_context; tri_context.dataStream = dataStream; tri_context.context = context; tri_context.qa = &cacheQArchive; tri_context.handledFileSet = &staleFilesThatHaveBeenSearched; char trifile[1024]; sprintf(trifile, "%s.tris", param->sourceArchiveName); TrigramSplitter* ts = trigram_load_from_file(trifile); if (ts && trigram_string_is_searchable(param->searchPattern)) { if (trigram_iterate_matching_files(ts, param->searchPattern, &tri_context, trigram_callback, 0)) { goto skip_archive; } else { //printf("exiting trigram search early!\n"); } } } { struct archive* cacheArchive = archive_read_new(); cacheQArchive.a = cacheArchive; // archive handling #if ARCHIVE_VERSION_NUMBER < 3000000 archive_read_support_compression_all(cacheArchive); #else archive_read_support_filter_all(cacheArchive); #endif archive_read_support_format_all(cacheArchive); r = archive_read_open_filename(cacheArchive, param->sourceArchiveName, 10240); if (r != ARCHIVE_OK) { printf("FATAL: %s", archive_error_string(cacheArchive)); exit(1); } while (archive_read_next_header(cacheArchive, &entry) == ARCHIVE_OK) { const char* entryName = archive_entry_pathname(entry); // Don't return results from deleted files if (SetContains(gDeletedFiles, entryName)) { archive_read_data_skip(cacheArchive); continue; } // Handle file name search if (param->searchFilenames) { if (RE2::PartialMatch(entryName, *(context->pattern))) { param->callbackFunction(param->callbackContext, entryName, 1, 0, 0); staleFilesThatHaveBeenSearched.insert(strdup(entryName)); } archive_read_data_skip(cacheArchive); continue; } FILE* file = NULL; /* Handle files that are stale in the cache */ if (SetContains(gStaleFiles, entryName)) { staleFilesThatHaveBeenSearched.insert(strdup(entryName)); file = OpenFile(baseDirectory, entryName); if (!file) { printf("[WARN] Unable to open %s, falling back to cache\n", entryName); } else { archive_read_data_skip(cacheArchive); } } ExecuteContentSearch(dataStream, &cacheQArchive, file, entry, entryName, context); if (file) { fclose(file); file = NULL; } } #if ARCHIVE_VERSION_NUMBER < 3000000 r = archive_read_finish(cacheArchive); #else r = archive_read_free(cacheArchive); #endif if (r != ARCHIVE_OK) { printf("archive_read_finish didn't finish properly\n"); exit(1); } } skip_archive: // Handle added files StringSet::iterator end = gStaleFiles.end(); for (StringSet::iterator i = gStaleFiles.begin(); i != end; ++i) { if (SetContains(staleFilesThatHaveBeenSearched, *i)) continue; if (param->searchFilenames) { if (RE2::PartialMatch(*i, *(context->pattern))) { param->callbackFunction(param->callbackContext, *i, 1, 0, 0); } continue; } FILE* file = OpenFile(baseDirectory, *i); if (file) { ExecuteContentSearch(dataStream, &cacheQArchive, file, NULL, *i, context); fclose(file); file = NULL; } } unsigned int blockSize; void* rawBlock = GetWriteBlock(dataStream, &blockSize); NamedDataBlock* endBlock = CreateNamedDataBlock(NULL, rawBlock, blockSize); SetUsedDataSize(endBlock, 0); PutWriteBlock(dataStream); //printf("Waiting on join\n"); join(consumer); delete context->pattern; delete context; DestroyStream(dataStream); //printf("gFallback %d gFallbackExpands %d\n", gFallback, gFallbackExpands); }