Ejemplo n.º 1
0
	RE2Regex(const char* string, unsigned int options): casefold(false)
	{
		RE2::Options opts;
		opts.set_posix_syntax(true);
		opts.set_perl_classes(true);
		opts.set_word_boundary(true);
		opts.set_one_line(false);
		opts.set_never_nl(true);
		opts.set_literal((options & RO_LITERAL) != 0);
		opts.set_log_errors(false);
		
		std::string pattern;
		if ((options & RO_IGNORECASE) && transformRegexCasefold(string, pattern, (options & RO_LITERAL) != 0))
		{
			casefold = true;
		}
		else
		{
			pattern = string;
			opts.set_case_sensitive((options & RO_IGNORECASE) == 0);
		}
		
		re.reset(new RE2(pattern, opts));
		if (!re->ok())
			throw std::runtime_error("Error parsing regular expression " + (string + (": " + re->error())));

		std::string prefix = getPrefix(re.get(), 128);

		if (prefix.length() == 1)
			matcher.reset(new LiteralMatcherFirst(prefix.c_str()));
	#ifdef USE_SSE2
		else if (prefix.length() > 1)
			matcher.reset(new LiteralMatcherSSE(prefix.c_str()));
	#endif
	}
Ejemplo n.º 2
0
// Copies every option field of the Java-side options object into the native
// RE2::Options instance.
//
//   options    - destination; each setter below is called exactly once
//   env        - JNI environment used for all field reads
//   j_options  - Java options object; must not be null (asserted)
//
// Field IDs are resolved by name through get_field_id_safe on each call.
static void cpy_options(RE2::Options &options, JNIEnv *env, jobject j_options) {
    assert(j_options != 0);
    jclass cls = env->GetObjectClass(j_options);

    // Reads the boolean (JNI signature "Z") field called `name` from j_options.
    auto read_flag = [&](const char *name) {
        return env->GetBooleanField(j_options, get_field_id_safe(env, cls, name, "Z"));
    };

    // The encoding is an object field that is translated by get_re2_encoding.
    jobject j_encoding = env->GetObjectField(
        j_options,
        get_field_id_safe(env, cls, "encoding", "Lcom/logentries/re2/Encoding;"));
    options.set_encoding(get_re2_encoding(env, j_encoding));

    options.set_posix_syntax(read_flag("posixSyntax"));
    options.set_longest_match(read_flag("longestMatch"));
    options.set_log_errors(read_flag("logErrors"));

    // maxMem is a Java long ("J"); safe_cast converts it to the unsigned type.
    options.set_max_mem(safe_cast<uint64_t>(
        env->GetLongField(j_options, get_field_id_safe(env, cls, "maxMem", "J"))));

    options.set_literal(read_flag("literal"));
    options.set_never_nl(read_flag("neverNl"));
    options.set_never_capture(read_flag("neverCapture"));
    options.set_case_sensitive(read_flag("caseSensitive"));
    options.set_perl_classes(read_flag("perlClasses"));
    options.set_word_boundary(read_flag("wordBoundary"));
}
Ejemplo n.º 3
0
/*
 * @bbeveridge
 * ExecuteSearch has gotten very hairy, and should be #refactored.
 * The intended logic goes as such
 * - If it is valid to search using trigram indexing, then:
 *   1) Search using trigrams.  To avoid rehitting stalefiles, add
 *      every searched file to the staleFilesThatHaveBeenSearched set
 *   2) Skip reading the archive entirely
 *   3) Do the same handling for stalefiles & wait for regex streams
 *
 * - Search the archive by:
 *   1) iterating all files in the archive, if a file is deleted, skip
 *      it.  If a file is 'stale' then skip it in the archive & search
 *      the actual file on disk
 *
 * - Directly search any files that have been added to disk & are
 *   therefore not indexed.
 * - Wait for the RE2 threads to complete
 *
 * Parameters:
 *   param - the search request.  Fields read here: sourceArchiveName,
 *           streamBlockSize, streamBlockCount, callbackFunction,
 *           callbackContext, caseSensitive, regexIsLiteral, searchPattern,
 *           secondPhasePattern, searchFilenames and ignoreTrigrams.
 *
 * Failure handling: an invalid primary or secondary regex, a failure to
 * open the archive, or a failure to close it prints a message and calls
 * exit(1).
 *
 * The trigram index path is built with a bounded snprintf; if
 * "<sourceArchiveName>.tris" does not fit in the local buffer the trigram
 * step is skipped and the archive is scanned instead.
 */
void ExecuteSearch(GrepParams* param)
{
  struct archive_entry *entry;
  int r;
  StringSet staleFilesThatHaveBeenSearched;

  LoadStaleSets(param->sourceArchiveName);

  // Set up the context shared with the consumer thread: the data stream,
  // the result callback and the compiled pattern(s).
  ConsumerThreadContext* context = new ConsumerThreadContext();
  Stream* dataStream = CreateStream(param->streamBlockSize, param->streamBlockCount);
  context->dataStream = dataStream;
  context->callbackFunction = param->callbackFunction;
  context->callbackContext = param->callbackContext;
  RE2::Options options;
  options.set_case_sensitive(param->caseSensitive);
  options.set_literal(param->regexIsLiteral);
  context->pattern = new RE2(param->searchPattern, options);
  if (context->pattern->ok() == false)
  {
      printf("FATAL: Primary regex expression has an error : %s\n", context->pattern->error().c_str());
      exit(1);
  }
  if (param->secondPhasePattern)
  {
      // The optional second-phase pattern is always compiled case-insensitively.
      // NOTE(review): context->secondPhasePattern is not deleted in this
      // function -- confirm it is released elsewhere.
      RE2::Options nocase;
      nocase.set_case_sensitive(false);
      context->secondPhasePattern = new RE2(param->secondPhasePattern, nocase);

      if (context->secondPhasePattern->ok() == false)
      {
          printf("FATAL: Secondary regex expression has an error : %s\n", context->secondPhasePattern->error().c_str());
          exit(1);
      }
  }

  thread* consumer = launch(grepThreadFn, context);

  struct QArchive cacheQArchive;
  cacheQArchive.a = NULL;
  std::string baseDirectory = GetBaseFromFilename(param->sourceArchiveName).c_str();
  if (!param->searchFilenames && !param->ignoreTrigrams)
  {
      // trigram search
      TrigramContext tri_context;
      tri_context.dataStream = dataStream;
      tri_context.context = context;
      tri_context.qa = &cacheQArchive;
      tri_context.handledFileSet = &staleFilesThatHaveBeenSearched;

      // Build "<archive>.tris" with a bounded write: an unbounded sprintf
      // here would overflow the buffer for long archive paths.
      char trifile[1024];
      const int written = snprintf(trifile, sizeof(trifile), "%s.tris", param->sourceArchiveName);

      // Only load the index when the full path fit; a truncated name would
      // refer to some other file.
      // NOTE(review): ts is never released in this function -- confirm
      // whether the trigram API expects a matching free.
      TrigramSplitter* ts = NULL;
      if (written >= 0 && static_cast<size_t>(written) < sizeof(trifile))
      {
          ts = trigram_load_from_file(trifile);
      }

      if (ts && trigram_string_is_searchable(param->searchPattern))
      {
          if (trigram_iterate_matching_files(ts, param->searchPattern, &tri_context, trigram_callback, 0))
          {
              // The trigram pass succeeded, so the archive scan is skipped.
              goto skip_archive;
          }
          else
          {
              //printf("exiting trigram search early!\n");
          }
      }
  }

  {
      struct archive* cacheArchive = archive_read_new();
      cacheQArchive.a = cacheArchive;

      // archive handling
      #if ARCHIVE_VERSION_NUMBER < 3000000
      archive_read_support_compression_all(cacheArchive);
      #else
      archive_read_support_filter_all(cacheArchive);
      #endif
      archive_read_support_format_all(cacheArchive);
      r = archive_read_open_filename(cacheArchive, param->sourceArchiveName, 10240);
      if (r != ARCHIVE_OK)
      {
          printf("FATAL: %s", archive_error_string(cacheArchive));
          exit(1);
      }

      while (archive_read_next_header(cacheArchive, &entry) == ARCHIVE_OK) {
          const char* entryName = archive_entry_pathname(entry);

          // Don't return results from deleted files
          if (SetContains(gDeletedFiles, entryName))
          {
              archive_read_data_skip(cacheArchive);
              continue;
          }

          // Handle file name search
          if (param->searchFilenames)
          {
              if (RE2::PartialMatch(entryName, *(context->pattern)))
              {
                  param->callbackFunction(param->callbackContext, entryName, 1, 0, 0);
                  // Record the name so the added-files pass below does not
                  // report it a second time.
                  // NOTE(review): the strdup'd copy is not freed in this
                  // function -- confirm who owns entries of this set.
                  staleFilesThatHaveBeenSearched.insert(strdup(entryName));
              }
              archive_read_data_skip(cacheArchive);
              continue;
          }

          FILE* file = NULL;
          /* Handle files that are stale in the cache */
          if (SetContains(gStaleFiles, entryName))
          {
              // NOTE(review): same strdup ownership question as above.
              staleFilesThatHaveBeenSearched.insert(strdup(entryName));
              file = OpenFile(baseDirectory, entryName);
              if (!file)
              {
                  printf("[WARN] Unable to open %s, falling back to cache\n", entryName);
              }
              else
              {
                  // The on-disk copy will be searched, so skip the archived data.
                  archive_read_data_skip(cacheArchive);
              }
          }

          // file is non-NULL only when the on-disk copy of a stale entry was
          // opened; otherwise the archived entry is passed with file == NULL.
          ExecuteContentSearch(dataStream, &cacheQArchive, file, entry, entryName, context);

          if (file)
          {
              fclose(file);
              file = NULL;
          }
      }
      #if ARCHIVE_VERSION_NUMBER < 3000000
      r = archive_read_finish(cacheArchive);
      #else
      r = archive_read_free(cacheArchive);
      #endif
      // NOTE(review): cacheQArchive.a still holds the released handle after
      // this point and is passed to ExecuteContentSearch below -- confirm it
      // is not dereferenced when a FILE* is supplied.
      if (r != ARCHIVE_OK)
      {
          printf("archive_read_finish didn't finish properly\n");
          exit(1);
      }
  }

skip_archive:
  // Handle added files
  // Every entry of gStaleFiles that was not already handled above is
  // searched straight from disk (or matched by name in filename mode).
  StringSet::iterator end = gStaleFiles.end();
  for (StringSet::iterator i = gStaleFiles.begin(); i != end; ++i)
  {
      if (SetContains(staleFilesThatHaveBeenSearched, *i))
          continue;

      if (param->searchFilenames)
      {
          if (RE2::PartialMatch(*i, *(context->pattern)))
          {
              param->callbackFunction(param->callbackContext, *i, 1, 0, 0);
          }
          continue;
      }

      // Files that cannot be opened are silently skipped.
      FILE* file = OpenFile(baseDirectory, *i);
      if (file)
      {
          ExecuteContentSearch(dataStream, &cacheQArchive, file, NULL, *i, context);
          fclose(file);
          file = NULL;
      }
  }

  // Publish a final block with a NULL name and zero used bytes, then wait
  // for the consumer thread -- presumably this block is the end-of-stream
  // marker the consumer looks for; confirm against grepThreadFn.
  unsigned int blockSize;
  void* rawBlock = GetWriteBlock(dataStream, &blockSize);
  NamedDataBlock* endBlock = CreateNamedDataBlock(NULL, rawBlock, blockSize);
  SetUsedDataSize(endBlock, 0);
  PutWriteBlock(dataStream);
  //printf("Waiting on join\n");
  join(consumer);

  // The consumer has finished, so the shared context can be torn down.
  delete context->pattern;
  delete context;
  DestroyStream(dataStream);
  //printf("gFallback %d gFallbackExpands %d\n", gFallback, gFallbackExpands);
}