/**
 * Reads every rated-regex rule from the configuration node, compiles its
 * pattern with RE2 and appends the result to rated_banning_regexes.
 *
 * Each entry of cfg is expected to provide the keys "rule", "regex",
 * "interval" and "hits_per_interval". The interval is multiplied by 1000
 * before being handed to RatedRegex, and the rate is computed as
 * hits_per_interval divided by that scaled interval.
 *
 * A YAML::RepresentationException aborts loading: rules stored before the
 * faulty entry are kept, the rest are not loaded.
 *
 * @param cfg sequence of rule nodes to load
 */
void
RegexManager::load_config(YAML::Node cfg)
{
  try {
    TSDebug(BANJAX_PLUGIN_NAME, "Setting regex re2 options");
    RE2::Options opt;
    opt.set_log_errors(false);
    opt.set_perl_classes(true);
    opt.set_posix_syntax(true);

    TSDebug(BANJAX_PLUGIN_NAME, "Loading regex manager conf");
    // Compile all the rules and store them for later use.
    for (YAML::const_iterator it = cfg.begin(); it != cfg.end(); ++it) {
      const std::string cur_rule = (*it)["rule"].as<std::string>();
      TSDebug(BANJAX_PLUGIN_NAME, "initiating rule %s", cur_rule.c_str());

      const unsigned int observation_interval = (*it)["interval"].as<unsigned int>();
      const unsigned int threshold = (*it)["hits_per_interval"].as<unsigned int>();

      // Read every YAML value before allocating anything, so that a
      // conversion failure cannot leak a half-constructed rule.
      const std::string pattern = (*it)["regex"].as<std::string>();

      RE2* compiled = new RE2(pattern, opt);
      if (!compiled->ok()) {
        // RE2's own error logging is switched off above, so report the
        // compilation failure here; otherwise it would go unnoticed.
        TSDebug(BANJAX_PLUGIN_NAME, "Failed to compile regex of rule %s [%s].",
                cur_rule.c_str(), compiled->error().c_str());
      }

      rated_banning_regexes.push_back(
        new RatedRegex(cur_rule, compiled,
                       observation_interval * 1000,
                       threshold / (double)(observation_interval * 1000)));
    }
  } catch (const YAML::RepresentationException& e) {
    TSDebug(BANJAX_PLUGIN_NAME, "Error loading regex manager conf [%s].", e.what());
    return;
  }

  TSDebug(BANJAX_PLUGIN_NAME, "Done loading regex manager conf");
}
RE2Regex(const char* string, unsigned int options): casefold(false) { RE2::Options opts; opts.set_posix_syntax(true); opts.set_perl_classes(true); opts.set_word_boundary(true); opts.set_one_line(false); opts.set_never_nl(true); opts.set_literal((options & RO_LITERAL) != 0); opts.set_log_errors(false); std::string pattern; if ((options & RO_IGNORECASE) && transformRegexCasefold(string, pattern, (options & RO_LITERAL) != 0)) { casefold = true; } else { pattern = string; opts.set_case_sensitive((options & RO_IGNORECASE) == 0); } re.reset(new RE2(pattern, opts)); if (!re->ok()) throw std::runtime_error("Error parsing regular expression " + (string + (": " + re->error()))); std::string prefix = getPrefix(re.get(), 128); if (prefix.length() == 1) matcher.reset(new LiteralMatcherFirst(prefix.c_str())); #ifdef USE_SSE2 else if (prefix.length() > 1) matcher.reset(new LiteralMatcherSSE(prefix.c_str())); #endif }
unsigned long mlre2__custom_regex_deserialize(void * dst) { int len = caml_deserialize_sint_4(); RE2::Options options; char * pattern = (char *) caml_stat_alloc(sizeof(*pattern) * (len)); caml_deserialize_block_1(pattern, len); pattern[len - 1] = '\0'; options.Copy(RE2::Quiet); options.set_max_mem(caml_deserialize_sint_8()); options_of_bitfield((uint16_t) caml_deserialize_uint_2(), options); #ifdef DEBUG std::cerr << "deserialized regex /" << pattern << "/" << std::endl; #endif *(RE2 **) dst = new RE2(pattern, options); caml_stat_free(pattern); return sizeof(RE2 *); }
/* returns (cre2__obj_t * int * (string * int) list) where * - cre2__obj_t is the ML-side name for a custom_block with a struct regex * * - int is the number of submatches, including the whole match * - (string * int) list is the Map.to_alist of the submatch (name, index) Map.t */ CAMLprim value mlre2__create_re(value v_options, value v_pattern) { value v_retval, v_compile_error; const char * c_pat = String_val(v_pattern); RE2::Options opt; RE2* compiled = NULL; opt.Copy(RE2::Quiet); while (v_options != Val_emptylist) { int val = Int_val(Field(Field(v_options, 0), 0)); switch (Tag_val(Field(v_options, 0))) { #define X(_u,FIRST,REST,_uu) case FIRST##REST : opt.set_##FIRST##REST(val); break; #define X__ENCODING(_u,FIRST,REST,_uu,SUFFIX,_uuu,TRANSLATED) \ case FIRST##REST##SUFFIX : opt.set_##FIRST##REST(val TRANSLATED); break; #define X__MAXMEM(_u,FIRST,REST,_uu) X(_u,FIRST,REST,_uu) #include "enum_x_macro.h" default : caml_invalid_argument("invalid option\n"); } v_options = Field(v_options, 1); } compiled = new RE2(c_pat, opt); if (!compiled->ok()) { /* Warning from this point on it's no longer safe to access v_options or v_pattern as the GC might be invoked from caml_copy_string and move those values (as we haven't registered the paramters they wouldn't get updated). This is fine because we don't access them before we call caml_raise_with_arg. */ v_compile_error = caml_copy_string(compiled->error().c_str()); delete compiled; compiled = NULL; caml_raise_with_arg(*caml_named_value("mlre2__Regex_compile_failed"), v_compile_error); } v_retval = caml_alloc_custom(&mlre2__custom_regex_ops, sizeof(compiled), 1024*1024, /* RE2 object uses ~1MB of memory outside the OCaml heap */ 500*1024*1024); /* I'm okay with 500MB of RAM being wasted */ Regex_val(v_retval) = compiled; return v_retval; }
/*
 * JNI entry point for com.logentries.re2.Options.setDefaults().
 *
 * Copies the values of a default-constructed RE2::Options into the fields of
 * the calling Java object, one field at a time: each field id is resolved
 * with get_field_id_safe and then written immediately.
 */
JNIEXPORT void JNICALL Java_com_logentries_re2_Options_setDefaults
  (JNIEnv *env, jobject j_this) {
    RE2::Options defaults;
    jclass cls = env->GetObjectClass(j_this);

    jfieldID encoding_fid = get_field_id_safe(env, cls, "encoding", "Lcom/logentries/re2/Encoding;");
    env->SetObjectField(j_this, encoding_fid, get_j_encoding(env, defaults.encoding()));

    jfieldID posix_syntax_fid = get_field_id_safe(env, cls, "posixSyntax", "Z");
    env->SetBooleanField(j_this, posix_syntax_fid, defaults.posix_syntax());

    jfieldID longest_match_fid = get_field_id_safe(env, cls, "longestMatch", "Z");
    env->SetBooleanField(j_this, longest_match_fid, defaults.longest_match());

    jfieldID log_errors_fid = get_field_id_safe(env, cls, "logErrors", "Z");
    env->SetBooleanField(j_this, log_errors_fid, defaults.log_errors());

    jfieldID max_mem_fid = get_field_id_safe(env, cls, "maxMem", "J");
    env->SetLongField(j_this, max_mem_fid, safe_cast<jlong>(defaults.max_mem()));

    jfieldID literal_fid = get_field_id_safe(env, cls, "literal", "Z");
    env->SetBooleanField(j_this, literal_fid, defaults.literal());

    jfieldID never_nl_fid = get_field_id_safe(env, cls, "neverNl", "Z");
    env->SetBooleanField(j_this, never_nl_fid, defaults.never_nl());

    jfieldID never_capture_fid = get_field_id_safe(env, cls, "neverCapture", "Z");
    env->SetBooleanField(j_this, never_capture_fid, defaults.never_capture());

    jfieldID case_sensitive_fid = get_field_id_safe(env, cls, "caseSensitive", "Z");
    env->SetBooleanField(j_this, case_sensitive_fid, defaults.case_sensitive());

    jfieldID perl_classes_fid = get_field_id_safe(env, cls, "perlClasses", "Z");
    env->SetBooleanField(j_this, perl_classes_fid, defaults.perl_classes());

    jfieldID word_boundary_fid = get_field_id_safe(env, cls, "wordBoundary", "Z");
    env->SetBooleanField(j_this, word_boundary_fid, defaults.word_boundary());
}
/* * @bbeveridge * ExecuteSearch has gotten very hairy, and should be #refactored. * The intended logic goes as such * - If it is valid to search using trigram indexing, then: * 1) Search using trigrams. To avoid rehitting stalefiles, add * every searched file to the staleFilesThatHaveBeenSearched set * 2) Skip reading the archive entirely * 3) Do the same handling for stalefiles & wait for regex streams * * - Search the archive by: * 1) iterating all files in the archive, if a file is deleted, skip * it. If a file is 'stale' then skip it in the archive & search * the actual file on disk * * - Directly search any files that have been added to disk & are * therefore not indexed. * - Wait for the RE2 threads to complete */ void ExecuteSearch(GrepParams* param) { struct archive_entry *entry; int r; StringSet staleFilesThatHaveBeenSearched; LoadStaleSets(param->sourceArchiveName); ConsumerThreadContext* context = new ConsumerThreadContext(); Stream* dataStream = CreateStream(param->streamBlockSize, param->streamBlockCount); context->dataStream = dataStream; context->callbackFunction = param->callbackFunction; context->callbackContext = param->callbackContext; RE2::Options options; options.set_case_sensitive(param->caseSensitive); options.set_literal(param->regexIsLiteral); context->pattern = new RE2(param->searchPattern, options); if (context->pattern->ok() == false) { printf("FATAL: Primary regex expression has an error : %s\n", context->pattern->error().c_str()); exit(1); } if (param->secondPhasePattern) { RE2::Options nocase; nocase.set_case_sensitive(false); context->secondPhasePattern = new RE2(param->secondPhasePattern, nocase); if (context->secondPhasePattern->ok() == false) { printf("FATAL: Secondary regex expression has an error : %s\n", context->secondPhasePattern->error().c_str()); exit(1); } } thread* consumer = launch(grepThreadFn, context); struct QArchive cacheQArchive; cacheQArchive.a = NULL; std::string baseDirectory = 
GetBaseFromFilename(param->sourceArchiveName).c_str(); if (!param->searchFilenames && !param->ignoreTrigrams) { // trigram search TrigramContext tri_context; tri_context.dataStream = dataStream; tri_context.context = context; tri_context.qa = &cacheQArchive; tri_context.handledFileSet = &staleFilesThatHaveBeenSearched; char trifile[1024]; sprintf(trifile, "%s.tris", param->sourceArchiveName); TrigramSplitter* ts = trigram_load_from_file(trifile); if (ts && trigram_string_is_searchable(param->searchPattern)) { if (trigram_iterate_matching_files(ts, param->searchPattern, &tri_context, trigram_callback, 0)) { goto skip_archive; } else { //printf("exiting trigram search early!\n"); } } } { struct archive* cacheArchive = archive_read_new(); cacheQArchive.a = cacheArchive; // archive handling #if ARCHIVE_VERSION_NUMBER < 3000000 archive_read_support_compression_all(cacheArchive); #else archive_read_support_filter_all(cacheArchive); #endif archive_read_support_format_all(cacheArchive); r = archive_read_open_filename(cacheArchive, param->sourceArchiveName, 10240); if (r != ARCHIVE_OK) { printf("FATAL: %s", archive_error_string(cacheArchive)); exit(1); } while (archive_read_next_header(cacheArchive, &entry) == ARCHIVE_OK) { const char* entryName = archive_entry_pathname(entry); // Don't return results from deleted files if (SetContains(gDeletedFiles, entryName)) { archive_read_data_skip(cacheArchive); continue; } // Handle file name search if (param->searchFilenames) { if (RE2::PartialMatch(entryName, *(context->pattern))) { param->callbackFunction(param->callbackContext, entryName, 1, 0, 0); staleFilesThatHaveBeenSearched.insert(strdup(entryName)); } archive_read_data_skip(cacheArchive); continue; } FILE* file = NULL; /* Handle files that are stale in the cache */ if (SetContains(gStaleFiles, entryName)) { staleFilesThatHaveBeenSearched.insert(strdup(entryName)); file = OpenFile(baseDirectory, entryName); if (!file) { printf("[WARN] Unable to open %s, falling back to 
cache\n", entryName); } else { archive_read_data_skip(cacheArchive); } } ExecuteContentSearch(dataStream, &cacheQArchive, file, entry, entryName, context); if (file) { fclose(file); file = NULL; } } #if ARCHIVE_VERSION_NUMBER < 3000000 r = archive_read_finish(cacheArchive); #else r = archive_read_free(cacheArchive); #endif if (r != ARCHIVE_OK) { printf("archive_read_finish didn't finish properly\n"); exit(1); } } skip_archive: // Handle added files StringSet::iterator end = gStaleFiles.end(); for (StringSet::iterator i = gStaleFiles.begin(); i != end; ++i) { if (SetContains(staleFilesThatHaveBeenSearched, *i)) continue; if (param->searchFilenames) { if (RE2::PartialMatch(*i, *(context->pattern))) { param->callbackFunction(param->callbackContext, *i, 1, 0, 0); } continue; } FILE* file = OpenFile(baseDirectory, *i); if (file) { ExecuteContentSearch(dataStream, &cacheQArchive, file, NULL, *i, context); fclose(file); file = NULL; } } unsigned int blockSize; void* rawBlock = GetWriteBlock(dataStream, &blockSize); NamedDataBlock* endBlock = CreateNamedDataBlock(NULL, rawBlock, blockSize); SetUsedDataSize(endBlock, 0); PutWriteBlock(dataStream); //printf("Waiting on join\n"); join(consumer); delete context->pattern; delete context; DestroyStream(dataStream); //printf("gFallback %d gFallbackExpands %d\n", gFallback, gFallbackExpands); }
static void cpy_options(RE2::Options &options, JNIEnv *env, jobject j_options) { assert(j_options != 0); jclass j_options_cls = env->GetObjectClass(j_options); options.set_encoding(get_re2_encoding(env, env->GetObjectField(j_options, get_field_id_safe(env, j_options_cls, "encoding", "Lcom/logentries/re2/Encoding;")))); options.set_posix_syntax(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "posixSyntax", "Z"))); options.set_longest_match(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "longestMatch", "Z"))); options.set_log_errors(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "logErrors", "Z"))); options.set_max_mem(safe_cast<uint64_t>(env->GetLongField(j_options, get_field_id_safe(env, j_options_cls, "maxMem", "J")))); options.set_literal(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "literal", "Z"))); options.set_never_nl(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverNl", "Z"))); options.set_never_capture(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverCapture", "Z"))); options.set_case_sensitive(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "caseSensitive", "Z"))); options.set_perl_classes(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "perlClasses", "Z"))); options.set_word_boundary(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "wordBoundary", "Z"))); }