Exemple #1
0
gint
rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
		const char *cache_dir, gdouble max_time, gboolean silent,
		GError **err)
{
	g_assert (cache != NULL);
	g_assert (cache_dir != NULL);

#ifndef WITH_HYPERSCAN
	g_set_error (err, rspamd_re_cache_quark (), EINVAL, "hyperscan is disabled");
	return -1;
#else
	GHashTableIter it, cit;
	gpointer k, v;
	struct rspamd_re_class *re_class;
	gchar path[PATH_MAX];
	hs_database_t *test_db;
	gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
	guint64 crc;
	rspamd_regexp_t *re;
	hs_compile_error_t *hs_errors;
	guint *hs_flags = NULL;
	const gchar **hs_pats = NULL;
	gchar *hs_serialized;
	gsize serialized_len, total = 0;
	struct iovec iov[7];

	g_hash_table_iter_init (&it, cache->re_classes);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		re_class = v;
		rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
				G_DIR_SEPARATOR, re_class->hash);

		if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {

			fd = open (path, O_RDONLY, 00600);

			/* Read number of regexps */
			g_assert (fd != -1);
			lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET);
			read (fd, &n, sizeof (n));
			close (fd);

			if (re_class->type_len > 0) {
				if (!silent) {
					msg_info_re_cache (
							"skip already valid class %s(%*s) to cache %6s, %d regexps",
							rspamd_re_cache_type_to_string (re_class->type),
							(gint) re_class->type_len - 1,
							re_class->type_data,
							re_class->hash,
							n);
				}
			}
			else {
				if (!silent) {
					msg_info_re_cache (
							"skip already valid class %s to cache %6s, %d regexps",
							rspamd_re_cache_type_to_string (re_class->type),
							re_class->hash,
							n);
				}
			}

			continue;
		}

		fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);

		if (fd == -1) {
			g_set_error (err, rspamd_re_cache_quark (), errno, "cannot open file "
					"%s: %s", path, strerror (errno));
			return -1;
		}

		g_hash_table_iter_init (&cit, re_class->re);
		n = g_hash_table_size (re_class->re);
		hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
		hs_ids = g_malloc (sizeof (*hs_ids) * n);
		hs_pats = g_malloc (sizeof (*hs_pats) * n);
		i = 0;

		while (g_hash_table_iter_next (&cit, &k, &v)) {
			re = v;

			pcre_flags = rspamd_regexp_get_pcre_flags (re);
			re_flags = rspamd_regexp_get_flags (re);

			if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
				/* Do not try to compile bad regexp */
				msg_info_re_cache (
						"do not try compile %s to hyperscan as it is PCRE only",
						rspamd_regexp_get_pattern (re));
				continue;
			}

			hs_flags[i] = 0;
#ifndef WITH_PCRE2
			if (pcre_flags & PCRE_FLAG(UTF8)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#else
			if (pcre_flags & PCRE_FLAG(UTF)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#endif
			if (pcre_flags & PCRE_FLAG(CASELESS)) {
				hs_flags[i] |= HS_FLAG_CASELESS;
			}
			if (pcre_flags & PCRE_FLAG(MULTILINE)) {
				hs_flags[i] |= HS_FLAG_MULTILINE;
			}
			if (pcre_flags & PCRE_FLAG(DOTALL)) {
				hs_flags[i] |= HS_FLAG_DOTALL;
			}
			if (rspamd_regexp_get_maxhits (re) == 1) {
				hs_flags[i] |= HS_FLAG_SINGLEMATCH;
			}

			if (hs_compile (rspamd_regexp_get_pattern (re),
					hs_flags[i],
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {
				msg_info_re_cache ("cannot compile %s to hyperscan, try prefilter match",
						rspamd_regexp_get_pattern (re));
				hs_free_compile_error (hs_errors);

				/* The approximation operation might take a significant
				 * amount of time, so we need to check if it's finite
				 */
				if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], max_time)) {
					hs_flags[i] |= HS_FLAG_PREFILTER;
					hs_ids[i] = rspamd_regexp_get_cache_id (re);
					hs_pats[i] = rspamd_regexp_get_pattern (re);
					i++;
				}
			}
			else {
				hs_ids[i] = rspamd_regexp_get_cache_id (re);
				hs_pats[i] = rspamd_regexp_get_pattern (re);
				i ++;
				hs_free_database (test_db);
			}
		}
		/* Adjust real re number */
		n = i;

		if (n > 0) {
			/* Create the hs tree */
			if (hs_compile_multi (hs_pats,
					hs_flags,
					hs_ids,
					n,
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {

				g_set_error (err, rspamd_re_cache_quark (), EINVAL,
						"cannot create tree of regexp when processing '%s': %s",
						hs_pats[hs_errors->expression], hs_errors->message);
				g_free (hs_flags);
				g_free (hs_ids);
				g_free (hs_pats);
				close (fd);
				hs_free_compile_error (hs_errors);

				return -1;
			}

			g_free (hs_pats);

			if (hs_serialize_database (test_db, &hs_serialized,
					&serialized_len) != HS_SUCCESS) {
				g_set_error (err,
						rspamd_re_cache_quark (),
						errno,
						"cannot serialize tree of regexp for %s",
						re_class->hash);

				close (fd);
				g_free (hs_ids);
				g_free (hs_flags);
				hs_free_database (test_db);

				return -1;
			}

			hs_free_database (test_db);

			/*
			 * Magic - 8 bytes
			 * Platform - sizeof (platform)
			 * n - number of regexps
			 * n * <regexp ids>
			 * n * <regexp flags>
			 * crc - 8 bytes checksum
			 * <hyperscan blob>
			 */
			crc = XXH64 (hs_serialized, serialized_len, 0xdeadbabe);

			if (cache->vectorized_hyperscan) {
				iov[0].iov_base = (void *) rspamd_hs_magic_vector;
			}
			else {
				iov[0].iov_base = (void *) rspamd_hs_magic;
			}
			iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
			iov[1].iov_base = &cache->plt;
			iov[1].iov_len = sizeof (cache->plt);
			iov[2].iov_base = &n;
			iov[2].iov_len = sizeof (n);
			iov[3].iov_base = hs_ids;
			iov[3].iov_len = sizeof (*hs_ids) * n;
			iov[4].iov_base = hs_flags;
			iov[4].iov_len = sizeof (*hs_flags) * n;
			iov[5].iov_base = &crc;
			iov[5].iov_len = sizeof (crc);
			iov[6].iov_base = hs_serialized;
			iov[6].iov_len = serialized_len;

			if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) {
				g_set_error (err,
						rspamd_re_cache_quark (),
						errno,
						"cannot serialize tree of regexp to %s: %s",
						path, strerror (errno));
				close (fd);
				g_free (hs_ids);
				g_free (hs_flags);
				g_free (hs_serialized);

				return -1;
			}

			if (re_class->type_len > 0) {
				msg_info_re_cache (
						"compiled class %s(%*s) to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						(gint) re_class->type_len - 1,
						re_class->type_data,
						re_class->hash,
						n);
			}
			else {
				msg_info_re_cache (
						"compiled class %s to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						re_class->hash,
						n);
			}

			total += n;

			g_free (hs_serialized);
			g_free (hs_ids);
			g_free (hs_flags);
		}

		close (fd);
	}

	return total;
#endif
}
Exemple #2
0
rspamd_regexp_t*
rspamd_regexp_new (const gchar *pattern, const gchar *flags,
		GError **err)
{
	const gchar *start = pattern, *end, *flags_str = NULL;
	gchar *err_str;
	rspamd_regexp_t *res;
	PCRE_T *r;
	gchar sep = 0, *real_pattern;
#ifndef WITH_PCRE2
	gint err_off;
#else
	gsize err_off;
#endif
	gint regexp_flags = 0, rspamd_flags = 0, err_code, ncaptures;
	gboolean strict_flags = FALSE;

	rspamd_regexp_library_init (NULL);

	if (flags == NULL) {
		/* We need to parse pattern and detect flags set */
		if (*start == '/') {
			sep = '/';
		}
		else if (*start == 'm') {
			start ++;
			sep = *start;

			/* Paired braces */
			if (sep == '{') {
				sep = '}';
			}

			rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH;
		}
		if (sep == '\0' || g_ascii_isalnum (sep)) {
			/* We have no flags, no separators and just use all line as expr */
			start = pattern;
			end = start + strlen (pattern);
			rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH;
		}
		else {
			end = strrchr (pattern, sep);

			if (end == NULL || end <= start) {
				g_set_error (err, rspamd_regexp_quark(), EINVAL,
						"pattern is not enclosed with %c: %s",
						sep, pattern);
				return NULL;
			}
			flags_str = end + 1;
			start ++;
		}
	}
	else {
		/* Strictly check all flags */
		strict_flags = TRUE;
		start = pattern;
		end = pattern + strlen (pattern);
		flags_str = flags;
	}

	rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;

#ifndef WITH_PCRE2
	regexp_flags &= ~PCRE_FLAG(UTF8);
	regexp_flags |= PCRE_FLAG(NEWLINE_ANYCRLF);
#else
	regexp_flags &= ~PCRE_FLAG(UTF);
#endif

	if (flags_str != NULL) {
		while (*flags_str) {
			switch (*flags_str) {
			case 'i':
				regexp_flags |= PCRE_FLAG(CASELESS);
				break;
			case 'm':
				regexp_flags |= PCRE_FLAG(MULTILINE);
				break;
			case 's':
				regexp_flags |= PCRE_FLAG(DOTALL);
				break;
			case 'x':
				regexp_flags |= PCRE_FLAG(EXTENDED);
				break;
			case 'u':
				rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW;
#ifndef WITH_PCRE2
				regexp_flags |= PCRE_FLAG(UTF8);
#else
				regexp_flags |= PCRE_FLAG(UTF);
#endif
				break;
			case 'O':
				/* We optimize all regexps by default */
				rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT;
				break;
			case 'r':
				rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;
#ifndef WITH_PCRE2
				regexp_flags &= ~PCRE_FLAG(UTF8);
#else
				regexp_flags &= ~PCRE_FLAG(UTF);
#endif
				break;
			default:
				if (strict_flags) {
					g_set_error (err, rspamd_regexp_quark(), EINVAL,
							"invalid regexp flag: %c in pattern %s",
							*flags_str, pattern);
					return NULL;
				}
				msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern);
				goto fin;
				break;
			}
			flags_str++;
		}
	}
fin:

	real_pattern = g_malloc (end - start + 1);
	rspamd_strlcpy (real_pattern, start, end - start + 1);

#ifndef WITH_PCRE2
	r = pcre_compile (real_pattern, regexp_flags,
			(const char **)&err_str, &err_off, NULL);
	(void)err_code;
#else
	r = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
			regexp_flags,
			&err_code, &err_off, pcre2_ctx);

	if (r == NULL) {
		err_str = g_alloca (1024);
		memset (err_str, 0, 1024);
		pcre2_get_error_message (err_code, err_str, 1024);
	}
#endif

	if (r == NULL) {
		g_set_error (err, rspamd_regexp_quark(), EINVAL,
			"regexp parsing error: '%s' at position %d",
			err_str, (gint)err_off);
		g_free (real_pattern);

		return NULL;
	}

	/* Now allocate the target structure */
	res = g_malloc0 (sizeof (*res));
	REF_INIT_RETAIN (res, rspamd_regexp_dtor);
	res->flags = rspamd_flags;
	res->pattern = real_pattern;
	res->cache_id = RSPAMD_INVALID_ID;
	res->pcre_flags = regexp_flags;
	res->max_hits = 0;
	res->re = r;

	if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) {
		res->raw_re = r;
	}
	else {
#ifndef WITH_PCRE2
		res->raw_re = pcre_compile (real_pattern, regexp_flags & ~PCRE_FLAG(UTF8),
				(const char **)&err_str, &err_off, NULL);
		(void)err_code;
#else
		res->raw_re = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
					regexp_flags & ~PCRE_FLAG(UTF),
					&err_code, &err_off, pcre2_ctx);
		if (res->raw_re == NULL) {
			err_str = g_alloca (1024);
			memset (err_str, 0, 1024);
			pcre2_get_error_message (err_code, err_str, 1024);
		}
#endif
		if (res->raw_re == NULL) {
			msg_warn ("raw regexp parsing error: '%s': '%s' at position %d",
					err_str, real_pattern, (gint)err_off);
		}
	}

	rspamd_regexp_post_process (res);
	rspamd_regexp_generate_id (pattern, flags, res->id);

#ifndef WITH_PCRE2
	/* Check number of captures */
	if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT,
			&ncaptures) == 0) {
		res->ncaptures = ncaptures;
	}

	/* Check number of backrefs */
	if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_BACKREFMAX,
			&ncaptures) == 0) {
		res->nbackref = ncaptures;
	}
#else
	/* Check number of captures */
	if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_CAPTURECOUNT,
			&ncaptures) == 0) {
		res->ncaptures = ncaptures;
	}

	/* Check number of backrefs */
	if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_BACKREFMAX,
			&ncaptures) == 0) {
		res->nbackref = ncaptures;
	}
#endif

	return res;
}