Example #1
0
/**
  Reads all the rate-limited regex rules from the configuration node,
  compiles them and stores them in rated_banning_regexes.

  Every entry of cfg is read through the keys "rule", "regex",
  "interval" and "hits_per_interval". The interval is multiplied by
  1000 before being handed to RatedRegex, and the allowed rate is
  expressed as hits per that scaled interval.

  A rule whose regex does not compile is logged and skipped, so an
  unusable RE2 object is never stored. A YAML representation error
  aborts loading; rules stored before the error are kept.

  @param cfg YAML sequence of rule entries
 */
void
RegexManager::load_config(YAML::Node cfg)
{
   try
   {

     TSDebug(BANJAX_PLUGIN_NAME, "Setting regex re2 options");
     RE2::Options opt;
     // RE2's own error logging is off, so compile failures have to be
     // detected and reported here via RE2::ok().
     opt.set_log_errors(false);
     opt.set_perl_classes(true);
     opt.set_posix_syntax(true);

     TSDebug(BANJAX_PLUGIN_NAME, "Loading regex manager conf");
     //now we compile all of them and store them for later use
     for(YAML::const_iterator it = cfg.begin(); it != cfg.end(); ++it) {
       std::string cur_rule = (*it)["rule"].as<std::string>();
       TSDebug(BANJAX_PLUGIN_NAME, "initiating rule %s", cur_rule.c_str());

       unsigned int observation_interval = (*it)["interval"].as<unsigned int>();
       unsigned int threshold  = (*it)["hits_per_interval"].as<unsigned int>();
       // NOTE(review): an interval of 0 makes the rate below a division
       // by zero -- confirm whether the configuration can contain it.

       // Keep the pattern in a named string so the pointer handed to RE2
       // stays valid for the whole constructor call.
       const std::string pattern = (*it)["regex"].as<std::string>();
       RE2* compiled = new RE2(pattern.c_str(), opt);
       if (!compiled->ok()) {
         TSDebug(BANJAX_PLUGIN_NAME, "Skipping rule %s: invalid regex [%s].", cur_rule.c_str(), compiled->error().c_str());
         delete compiled;
         continue;
       }

       rated_banning_regexes.push_back(new RatedRegex(cur_rule, compiled, observation_interval * 1000, threshold / static_cast<double>(observation_interval * 1000)));

     }
    }
   catch(const YAML::RepresentationException& e)
     {
       TSDebug(BANJAX_PLUGIN_NAME, "Error loading regex manager conf [%s].", e.what());
	return;
     }
     TSDebug(BANJAX_PLUGIN_NAME, "Done loading regex manager conf");

}
Example #2
0
	// Compiles `string` into an RE2 program.
	//
	// `options` is a bit mask: RO_LITERAL makes the pattern a literal,
	// RO_IGNORECASE requests case-insensitive matching. For the latter the
	// pattern is first offered to transformRegexCasefold(); only when that
	// declines is RE2's own case-insensitive mode used.
	//
	// Throws std::runtime_error when RE2 rejects the pattern.
	RE2Regex(const char* string, unsigned int options): casefold(false)
	{
		const bool literal = (options & RO_LITERAL) != 0;
		const bool ignoreCase = (options & RO_IGNORECASE) != 0;

		RE2::Options opts;
		opts.set_log_errors(false);
		opts.set_literal(literal);
		opts.set_never_nl(true);
		opts.set_one_line(false);
		opts.set_word_boundary(true);
		opts.set_perl_classes(true);
		opts.set_posix_syntax(true);

		// The transform is attempted only for case-insensitive searches.
		std::string pattern;
		casefold = ignoreCase && transformRegexCasefold(string, pattern, literal);
		if (!casefold)
		{
			pattern = string;
			opts.set_case_sensitive(!ignoreCase);
		}

		re.reset(new RE2(pattern, opts));
		if (!re->ok())
		{
			std::string message("Error parsing regular expression ");
			message += string;
			message += ": ";
			message += re->error();
			throw std::runtime_error(message);
		}

		// Pick a literal pre-matcher based on the length of the prefix
		// reported by getPrefix().
		const std::string prefix = getPrefix(re.get(), 128);
		const std::string::size_type prefixLength = prefix.length();

		if (prefixLength == 1)
		{
			matcher.reset(new LiteralMatcherFirst(prefix.c_str()));
		}
	#ifdef USE_SSE2
		if (prefixLength > 1)
		{
			matcher.reset(new LiteralMatcherSSE(prefix.c_str()));
		}
	#endif
	}
Example #3
0
  unsigned long mlre2__custom_regex_deserialize(void * dst) {
    int len = caml_deserialize_sint_4();
    RE2::Options options;
    char * pattern = (char *) caml_stat_alloc(sizeof(*pattern) * (len));
    caml_deserialize_block_1(pattern, len);
    pattern[len - 1] = '\0';
    options.Copy(RE2::Quiet);
    options.set_max_mem(caml_deserialize_sint_8());
    options_of_bitfield((uint16_t) caml_deserialize_uint_2(), options);
#ifdef DEBUG
    std::cerr << "deserialized regex /" << pattern << "/" << std::endl;
#endif
    *(RE2 **) dst = new RE2(pattern, options);
    caml_stat_free(pattern);
    return sizeof(RE2 *);
  }
Example #4
0
  /* Compiles v_pattern into an RE2 using the options listed in v_options
   * and returns it wrapped in a custom block.
   *
   * v_options is walked as an OCaml list; each element's tag selects the
   * option to set and its first field carries the integer value.  An
   * unknown tag raises Invalid_argument; a pattern RE2 rejects raises the
   * exception registered as "mlre2__Regex_compile_failed" with the RE2
   * error text as argument.
   *
   * Original description of the return value:
   * returns (cre2__obj_t * int * (string * int) list) where
   * - cre2__obj_t is the ML-side name for a custom_block with a struct regex *
   * - int is the number of submatches, including the whole match
   * - (string * int) list is the Map.to_alist of the submatch (name, index) Map.t
   *
   * NOTE(review): the code below returns only the custom block holding
   * the RE2 pointer; the tuple description above looks out of date --
   * confirm against the ML side.
   */
  CAMLprim value mlre2__create_re(value v_options, value v_pattern) {
    value v_retval, v_compile_error;
    const char * c_pat = String_val(v_pattern);
    RE2::Options opt;
    RE2* compiled = NULL;

    /* Start from RE2's Quiet preset, then apply the caller's overrides. */
    opt.Copy(RE2::Quiet);
    while (v_options != Val_emptylist) {
      /* Head of the list: the tag picks the option, field 0 is its value. */
      int val = Int_val(Field(Field(v_options, 0), 0));
      switch (Tag_val(Field(v_options, 0))) {
/* The X macros expand into one `case` per option when enum_x_macro.h is
   included below; each case calls the matching opt.set_...() setter. */
#define X(_u,FIRST,REST,_uu) case FIRST##REST : opt.set_##FIRST##REST(val); break;
#define X__ENCODING(_u,FIRST,REST,_uu,SUFFIX,_uuu,TRANSLATED)               \
        case FIRST##REST##SUFFIX : opt.set_##FIRST##REST(val TRANSLATED); break;
#define X__MAXMEM(_u,FIRST,REST,_uu) X(_u,FIRST,REST,_uu)
#include "enum_x_macro.h"
      default              : caml_invalid_argument("invalid option\n");
      }
      /* Advance to the tail of the list. */
      v_options = Field(v_options, 1);
    }

    compiled = new RE2(c_pat, opt);

    if (!compiled->ok()) {
      /* Warning
         from this point on it's no longer safe to access v_options or
         v_pattern as the GC might be invoked from caml_copy_string and
         move those values (as we haven't registered the parameters they
         wouldn't get updated).  This is fine because we don't access
         them before we call caml_raise_with_arg. */
      v_compile_error = caml_copy_string(compiled->error().c_str());
      /* The error text is copied above, so the RE2 is released before
         caml_raise_with_arg is called. */
      delete compiled;
      compiled = NULL;
      caml_raise_with_arg(*caml_named_value("mlre2__Regex_compile_failed"),
          v_compile_error);
    }

    /* The custom block only stores the pointer (sizeof(compiled)). */
    v_retval = caml_alloc_custom(&mlre2__custom_regex_ops, sizeof(compiled),
        1024*1024,      /* RE2 object uses ~1MB of memory outside the OCaml heap */
        500*1024*1024);  /* I'm okay with 500MB of RAM being wasted */

    Regex_val(v_retval) = compiled;

    return v_retval;
  }
Example #5
0
// Copies the values of a default-constructed RE2::Options into the
// same-named fields of the Java object j_this.
JNIEXPORT void JNICALL Java_com_logentries_re2_Options_setDefaults
  (JNIEnv *env, jobject j_this) {
    RE2::Options defaults;
    jclass cls = env->GetObjectClass(j_this);
    jfieldID fid;

    // Object-typed field: the encoding enum.
    fid = get_field_id_safe(env, cls, "encoding", "Lcom/logentries/re2/Encoding;");
    env->SetObjectField(j_this, fid, get_j_encoding(env, defaults.encoding()));

    fid = get_field_id_safe(env, cls, "posixSyntax", "Z");
    env->SetBooleanField(j_this, fid, defaults.posix_syntax());

    fid = get_field_id_safe(env, cls, "longestMatch", "Z");
    env->SetBooleanField(j_this, fid, defaults.longest_match());

    fid = get_field_id_safe(env, cls, "logErrors", "Z");
    env->SetBooleanField(j_this, fid, defaults.log_errors());

    // The only long-typed field; safe_cast performs the conversion to jlong.
    fid = get_field_id_safe(env, cls, "maxMem", "J");
    env->SetLongField(j_this, fid, safe_cast<jlong>(defaults.max_mem()));

    fid = get_field_id_safe(env, cls, "literal", "Z");
    env->SetBooleanField(j_this, fid, defaults.literal());

    fid = get_field_id_safe(env, cls, "neverNl", "Z");
    env->SetBooleanField(j_this, fid, defaults.never_nl());

    fid = get_field_id_safe(env, cls, "neverCapture", "Z");
    env->SetBooleanField(j_this, fid, defaults.never_capture());

    fid = get_field_id_safe(env, cls, "caseSensitive", "Z");
    env->SetBooleanField(j_this, fid, defaults.case_sensitive());

    fid = get_field_id_safe(env, cls, "perlClasses", "Z");
    env->SetBooleanField(j_this, fid, defaults.perl_classes());

    fid = get_field_id_safe(env, cls, "wordBoundary", "Z");
    env->SetBooleanField(j_this, fid, defaults.word_boundary());
}
Example #6
0
/*
 * @bbeveridge
 * ExecuteSearch has gotten very hairy, and should be #refactored.
 * The intended logic goes as such
 * - If it is valid to search using trigram indexing, then:
 *   1) Search using trigrams.  To avoid rehitting stalefiles, add
 *      every searched file to the staleFilesThatHaveBeenSearched set
 *   2) Skip reading the archive entirely
 *   3) Do the same handling for stalefiles & wait for regex streams
 *
 * - Search the archive by:
 *   1) iterating all files in the archive, if a file is deleted, skip
 *      it.  If a file is 'stale' then skip it in the archive & search
 *      the actual file on disk
 *
 * - Directly search any files that have been added to disk & are
 *   therefore not indexed.
 * - Wait for the RE2 threads to complete
 *
 * The process exits with status 1 when either regex fails to compile or
 * when the archive cannot be opened or closed.
 *
 * The name of the trigram index ("<archive>.tris") is built with a
 * bounded write.  If it does not fit the buffer the trigram path is
 * skipped and the archive is scanned instead.
 */
void ExecuteSearch(GrepParams* param)
{
  struct archive_entry *entry;
  int r;
  StringSet staleFilesThatHaveBeenSearched;

  LoadStaleSets(param->sourceArchiveName);

  // Shared state handed to the consumer thread that runs the regexes.
  ConsumerThreadContext* context = new ConsumerThreadContext();
  Stream* dataStream = CreateStream(param->streamBlockSize, param->streamBlockCount);
  context->dataStream = dataStream;
  context->callbackFunction = param->callbackFunction;
  context->callbackContext = param->callbackContext;
  RE2::Options options;
  options.set_case_sensitive(param->caseSensitive);
  options.set_literal(param->regexIsLiteral);
  context->pattern = new RE2(param->searchPattern, options);
  if (context->pattern->ok() == false)
  {
      printf("FATAL: Primary regex expression has an error : %s\n", context->pattern->error().c_str());
      exit(1);
  }
  if (param->secondPhasePattern)
  {
      // The second-phase pattern is always matched case-insensitively.
      RE2::Options nocase;
      nocase.set_case_sensitive(false);
      context->secondPhasePattern = new RE2(param->secondPhasePattern, nocase);

      if (context->secondPhasePattern->ok() == false)
      {
          printf("FATAL: Secondary regex expression has an error : %s\n", context->secondPhasePattern->error().c_str());
          exit(1);
      }
  }

  thread* consumer = launch(grepThreadFn, context);

  struct QArchive cacheQArchive;
  cacheQArchive.a = NULL;
  std::string baseDirectory = GetBaseFromFilename(param->sourceArchiveName);
  if (!param->searchFilenames && !param->ignoreTrigrams)
  {
      // trigram search
      TrigramContext tri_context;
      tri_context.dataStream = dataStream;
      tri_context.context = context;
      tri_context.qa = &cacheQArchive;
      tri_context.handledFileSet = &staleFilesThatHaveBeenSearched;

      // Bounded formatting: an over-long archive name must not overflow
      // the buffer.  A truncated name is treated as "no index available".
      char trifile[1024];
      const int trifileLen = snprintf(trifile, sizeof(trifile), "%s.tris", param->sourceArchiveName);
      TrigramSplitter* ts = NULL;
      if (trifileLen >= 0 && trifileLen < static_cast<int>(sizeof(trifile)))
      {
          ts = trigram_load_from_file(trifile);
      }
      // NOTE(review): ts is never released in this function -- confirm
      // who owns it.
      if (ts && trigram_string_is_searchable(param->searchPattern))
      {
          if (trigram_iterate_matching_files(ts, param->searchPattern, &tri_context, trigram_callback, 0))
          {
              goto skip_archive;
          }
          else
          {
              //printf("exiting trigram search early!\n");
          }
      }
  }

  {
      struct archive* cacheArchive = archive_read_new();
      cacheQArchive.a = cacheArchive;

      // archive handling
      #if ARCHIVE_VERSION_NUMBER < 3000000
      archive_read_support_compression_all(cacheArchive);
      #else
      archive_read_support_filter_all(cacheArchive);
      #endif
      archive_read_support_format_all(cacheArchive);
      r = archive_read_open_filename(cacheArchive, param->sourceArchiveName, 10240);
      if (r != ARCHIVE_OK)
      {
          printf("FATAL: %s", archive_error_string(cacheArchive));
          exit(1);
      }

      while (archive_read_next_header(cacheArchive, &entry) == ARCHIVE_OK) {
          const char* entryName = archive_entry_pathname(entry);

          // Don't return results from deleted files
          if (SetContains(gDeletedFiles, entryName))
          {
              archive_read_data_skip(cacheArchive);
              continue;
          }

          // Handle file name search
          if (param->searchFilenames)
          {
              if (RE2::PartialMatch(entryName, *(context->pattern)))
              {
                  param->callbackFunction(param->callbackContext, entryName, 1, 0, 0);
                  // NOTE(review): the strdup'ed key is never freed here --
                  // confirm whether StringSet takes ownership.
                  staleFilesThatHaveBeenSearched.insert(strdup(entryName));
              }
              archive_read_data_skip(cacheArchive);
              continue;
          }

          FILE* file = NULL;
          /* Handle files that are stale in the cache */
          if (SetContains(gStaleFiles, entryName))
          {
              staleFilesThatHaveBeenSearched.insert(strdup(entryName));
              file = OpenFile(baseDirectory, entryName);
              if (!file)
              {
                  printf("[WARN] Unable to open %s, falling back to cache\n", entryName);
              }
              else
              {
                  // The on-disk copy is searched, so the archived data for
                  // this entry is skipped.
                  archive_read_data_skip(cacheArchive);
              }
          }

          ExecuteContentSearch(dataStream, &cacheQArchive, file, entry, entryName, context);

          if (file)
          {
              fclose(file);
              file = NULL;
          }
      }
      #if ARCHIVE_VERSION_NUMBER < 3000000
      r = archive_read_finish(cacheArchive);
      #else
      r = archive_read_free(cacheArchive);
      #endif
      if (r != ARCHIVE_OK)
      {
          printf("archive_read_finish didn't finish properly\n");
          exit(1);
      }
  }

skip_archive:
  // Handle added files
  StringSet::iterator end = gStaleFiles.end();
  for (StringSet::iterator i = gStaleFiles.begin(); i != end; ++i)
  {
      // Already handled by the trigram or archive pass.
      if (SetContains(staleFilesThatHaveBeenSearched, *i))
          continue;

      if (param->searchFilenames)
      {
          if (RE2::PartialMatch(*i, *(context->pattern)))
          {
              param->callbackFunction(param->callbackContext, *i, 1, 0, 0);
          }
          continue;
      }

      FILE* file = OpenFile(baseDirectory, *i);
      if (file)
      {
          ExecuteContentSearch(dataStream, &cacheQArchive, file, NULL, *i, context);
          fclose(file);
          file = NULL;
      }
  }

  // Publish a final block with zero bytes used before joining the
  // consumer (presumably its end-of-stream marker -- confirm in
  // grepThreadFn).
  unsigned int blockSize;
  void* rawBlock = GetWriteBlock(dataStream, &blockSize);
  NamedDataBlock* endBlock = CreateNamedDataBlock(NULL, rawBlock, blockSize);
  SetUsedDataSize(endBlock, 0);
  PutWriteBlock(dataStream);
  //printf("Waiting on join\n");
  join(consumer);

  // NOTE(review): context->secondPhasePattern is not deleted here --
  // confirm whether ConsumerThreadContext releases it.
  delete context->pattern;
  delete context;
  DestroyStream(dataStream);
  //printf("gFallback %d gFallbackExpands %d\n", gFallback, gFallbackExpands);
}
Example #7
0
// Fills `options` from the fields of the Java options object j_options,
// one RE2 setter per same-named Java field. j_options must be non-null.
static void cpy_options(RE2::Options &options, JNIEnv *env, jobject j_options) {
    assert(j_options != 0);
    jclass cls = env->GetObjectClass(j_options);
    jfieldID fid;

    // Object-typed field: converted by get_re2_encoding.
    fid = get_field_id_safe(env, cls, "encoding", "Lcom/logentries/re2/Encoding;");
    options.set_encoding(get_re2_encoding(env, env->GetObjectField(j_options, fid)));

    fid = get_field_id_safe(env, cls, "posixSyntax", "Z");
    options.set_posix_syntax(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "longestMatch", "Z");
    options.set_longest_match(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "logErrors", "Z");
    options.set_log_errors(env->GetBooleanField(j_options, fid));

    // The only long-typed field; safe_cast performs the conversion to uint64_t.
    fid = get_field_id_safe(env, cls, "maxMem", "J");
    options.set_max_mem(safe_cast<uint64_t>(env->GetLongField(j_options, fid)));

    fid = get_field_id_safe(env, cls, "literal", "Z");
    options.set_literal(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "neverNl", "Z");
    options.set_never_nl(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "neverCapture", "Z");
    options.set_never_capture(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "caseSensitive", "Z");
    options.set_case_sensitive(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "perlClasses", "Z");
    options.set_perl_classes(env->GetBooleanField(j_options, fid));

    fid = get_field_id_safe(env, cls, "wordBoundary", "Z");
    options.set_word_boundary(env->GetBooleanField(j_options, fid));
}