/**
 * Compile every regexp class of the cache to a serialized hyperscan database
 * and store it as `<cache_dir>/<class hash>.hs`.
 *
 * Classes that already have a valid cache file are skipped. Regexps flagged as
 * PCRE only are never passed to hyperscan; regexps that hyperscan cannot
 * compile directly are retried in prefilter mode if
 * rspamd_re_cache_is_finite() accepts them within `max_time`.
 *
 * @param cache regexp cache to compile (must not be NULL)
 * @param cache_dir directory where the `.hs` files are written (must not be NULL)
 * @param max_time time limit passed to rspamd_re_cache_is_finite()
 * @param silent if TRUE, do not log the classes skipped as already valid
 * @param err filled with the failure reason when -1 is returned
 * @return total number of regexps written to the newly compiled classes,
 * or -1 on error (a file created for the failed class is removed)
 */
gint
rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
		const char *cache_dir, gdouble max_time, gboolean silent,
		GError **err)
{
	g_assert (cache != NULL);
	g_assert (cache_dir != NULL);

#ifndef WITH_HYPERSCAN
	g_set_error (err, rspamd_re_cache_quark (), EINVAL, "hyperscan is disabled");
	return -1;
#else
	GHashTableIter it, cit;
	gpointer k, v;
	struct rspamd_re_class *re_class;
	gchar path[PATH_MAX];
	hs_database_t *test_db;
	gint fd = -1, i, n, *hs_ids = NULL, pcre_flags, re_flags;
	guint64 crc;
	rspamd_regexp_t *re;
	hs_compile_error_t *hs_errors;
	guint *hs_flags = NULL;
	const gchar **hs_pats = NULL;
	const gchar *bad_pat;
	gchar *hs_serialized = NULL;
	gsize serialized_len, total = 0, expected_len, j;
	gssize written;
	struct iovec iov[7];

	g_hash_table_iter_init (&it, cache->re_classes);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		re_class = v;
		rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
				G_DIR_SEPARATOR, re_class->hash);

		if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {
			fd = open (path, O_RDONLY, 00600);

			if (fd != -1) {
				/*
				 * Read number of regexps; it is used for logging only, so
				 * a failed or short read is reported as -1 instead of
				 * logging an uninitialised value.
				 */
				n = -1;

				if (lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt),
						SEEK_SET) == (off_t) -1 ||
						read (fd, &n, sizeof (n)) != (gssize) sizeof (n)) {
					n = -1;
				}

				close (fd);
				fd = -1;

				if (!silent) {
					if (re_class->type_len > 0) {
						msg_info_re_cache (
								"skip already valid class %s(%*s) to cache %6s, %d regexps",
								rspamd_re_cache_type_to_string (re_class->type),
								(gint) re_class->type_len - 1,
								re_class->type_data,
								re_class->hash,
								n);
					}
					else {
						msg_info_re_cache (
								"skip already valid class %s to cache %6s, %d regexps",
								rspamd_re_cache_type_to_string (re_class->type),
								re_class->hash,
								n);
					}
				}

				continue;
			}

			/*
			 * The file could not be opened after it has been validated
			 * (e.g. it has been removed meanwhile): do not abort, just try
			 * to build it again below.
			 */
		}

		fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);

		if (fd == -1) {
			g_set_error (err, rspamd_re_cache_quark (), errno, "cannot open file "
					"%s: %s", path, strerror (errno));
			return -1;
		}

		n = g_hash_table_size (re_class->re);
		hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
		hs_ids = g_malloc (sizeof (*hs_ids) * n);
		hs_pats = g_malloc (sizeof (*hs_pats) * n);
		i = 0;

		g_hash_table_iter_init (&cit, re_class->re);

		while (g_hash_table_iter_next (&cit, &k, &v)) {
			re = v;

			pcre_flags = rspamd_regexp_get_pcre_flags (re);
			re_flags = rspamd_regexp_get_flags (re);

			if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
				/* Do not try to compile bad regexp */
				msg_info_re_cache (
						"do not try compile %s to hyperscan as it is PCRE only",
						rspamd_regexp_get_pattern (re));
				continue;
			}

			/* Translate PCRE flags to their hyperscan counterparts */
			hs_flags[i] = 0;
#ifndef WITH_PCRE2
			if (pcre_flags & PCRE_FLAG(UTF8)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#else
			if (pcre_flags & PCRE_FLAG(UTF)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#endif
			if (pcre_flags & PCRE_FLAG(CASELESS)) {
				hs_flags[i] |= HS_FLAG_CASELESS;
			}
			if (pcre_flags & PCRE_FLAG(MULTILINE)) {
				hs_flags[i] |= HS_FLAG_MULTILINE;
			}
			if (pcre_flags & PCRE_FLAG(DOTALL)) {
				hs_flags[i] |= HS_FLAG_DOTALL;
			}
			if (rspamd_regexp_get_maxhits (re) == 1) {
				hs_flags[i] |= HS_FLAG_SINGLEMATCH;
			}

			/* Test each expression alone before adding it to the class */
			if (hs_compile (rspamd_regexp_get_pattern (re),
					hs_flags[i],
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {
				msg_info_re_cache ("cannot compile %s to hyperscan, try prefilter match",
						rspamd_regexp_get_pattern (re));
				hs_free_compile_error (hs_errors);

				/* The approximation operation might take a significant
				 * amount of time, so we need to check if it's finite
				 */
				if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], max_time)) {
					hs_flags[i] |= HS_FLAG_PREFILTER;
					hs_ids[i] = rspamd_regexp_get_cache_id (re);
					hs_pats[i] = rspamd_regexp_get_pattern (re);
					i ++;
				}
			}
			else {
				hs_ids[i] = rspamd_regexp_get_cache_id (re);
				hs_pats[i] = rspamd_regexp_get_pattern (re);
				i ++;
				hs_free_database (test_db);
			}
		}

		/* Adjust real re number */
		n = i;

		if (n > 0) {
			/* Create the hs tree */
			if (hs_compile_multi (hs_pats,
					hs_flags,
					hs_ids,
					n,
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {
				/*
				 * The expression index is negative when the error is not
				 * specific to a single expression, so it must be validated
				 * before being used as an array index.
				 */
				bad_pat = "(unknown)";

				if (hs_errors->expression >= 0 && hs_errors->expression < n) {
					bad_pat = hs_pats[hs_errors->expression];
				}

				g_set_error (err, rspamd_re_cache_quark (), EINVAL,
						"cannot create tree of regexp when processing '%s': %s",
						bad_pat, hs_errors->message);
				hs_free_compile_error (hs_errors);

				goto fail;
			}

			if (hs_serialize_database (test_db, &hs_serialized,
					&serialized_len) != HS_SUCCESS) {
				/* errno is not set by hyperscan, use a fixed error code */
				g_set_error (err,
						rspamd_re_cache_quark (),
						EINVAL,
						"cannot serialize tree of regexp for %s",
						re_class->hash);
				hs_free_database (test_db);
				/* Never free whatever a failed call has left in the pointer */
				hs_serialized = NULL;

				goto fail;
			}

			hs_free_database (test_db);

			/*
			 * Magic - 8 bytes
			 * Platform - sizeof (platform)
			 * n - number of regexps
			 * n * <regexp ids>
			 * n * <regexp flags>
			 * crc - 8 bytes checksum
			 * <hyperscan blob>
			 */
			crc = XXH64 (hs_serialized, serialized_len, 0xdeadbabe);

			if (cache->vectorized_hyperscan) {
				iov[0].iov_base = (void *) rspamd_hs_magic_vector;
			}
			else {
				iov[0].iov_base = (void *) rspamd_hs_magic;
			}

			iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
			iov[1].iov_base = &cache->plt;
			iov[1].iov_len = sizeof (cache->plt);
			iov[2].iov_base = &n;
			iov[2].iov_len = sizeof (n);
			iov[3].iov_base = hs_ids;
			iov[3].iov_len = sizeof (*hs_ids) * n;
			iov[4].iov_base = hs_flags;
			iov[4].iov_len = sizeof (*hs_flags) * n;
			iov[5].iov_base = &crc;
			iov[5].iov_len = sizeof (crc);
			iov[6].iov_base = hs_serialized;
			iov[6].iov_len = serialized_len;

			expected_len = 0;

			for (j = 0; j < G_N_ELEMENTS (iov); j ++) {
				expected_len += iov[j].iov_len;
			}

			written = writev (fd, iov, G_N_ELEMENTS (iov));

			if (written == -1) {
				g_set_error (err,
						rspamd_re_cache_quark (),
						errno,
						"cannot serialize tree of regexp to %s: %s",
						path, strerror (errno));

				goto fail;
			}

			if ((gsize) written != expected_len) {
				/* A partially written file cannot be loaded later */
				g_set_error (err,
						rspamd_re_cache_quark (),
						EIO,
						"cannot serialize tree of regexp to %s: %s",
						path, "short write");

				goto fail;
			}

			if (re_class->type_len > 0) {
				msg_info_re_cache (
						"compiled class %s(%*s) to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						(gint) re_class->type_len - 1,
						re_class->type_data,
						re_class->hash,
						n);
			}
			else {
				msg_info_re_cache (
						"compiled class %s to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						re_class->hash,
						n);
			}

			total += n;

			g_free (hs_serialized);
			hs_serialized = NULL;
		}

		/*
		 * Per class buffers are released here for both the `n > 0` and
		 * the `n == 0` cases, so a class without usable regexps does not
		 * leak them.
		 */
		g_free (hs_pats);
		hs_pats = NULL;
		g_free (hs_ids);
		hs_ids = NULL;
		g_free (hs_flags);
		hs_flags = NULL;

		close (fd);
		fd = -1;
	}

	return total;

fail:
	/* Single cleanup point for all failures after the file was created */
	g_free (hs_serialized);
	g_free (hs_pats);
	g_free (hs_ids);
	g_free (hs_flags);
	close (fd);
	/*
	 * The file was created by this call (O_EXCL) and is empty or truncated:
	 * remove it, otherwise it would make the next O_EXCL creation fail.
	 */
	unlink (path);

	return -1;
#endif
}
/**
 * Create a new regexp object from a pattern and an optional flags string.
 *
 * If `flags` is NULL, the pattern may be written as `/re/flags` or as
 * `m<sep>re<sep>flags` (`m{re}flags` for braces); the `m` form also sets
 * RSPAMD_REGEXP_FLAG_FULL_MATCH. If there is no separator, or it is
 * alphanumeric, the whole string is used as the expression. In this mode an
 * unknown flag is logged and stops flag parsing.
 *
 * If `flags` is not NULL, the whole `pattern` is the expression and an
 * unknown flag is an error.
 *
 * Recognised flags: `i` (caseless), `m` (multiline), `s` (dotall),
 * `x` (extended), `u` (UTF mode, clears the raw flag), `r` (raw, clears UTF
 * mode) and `O` (sets RSPAMD_REGEXP_FLAG_NOOPT). Regexps are raw by default.
 *
 * @param pattern regexp text; NULL is reported as an error
 * @param flags flags string or NULL to parse flags from the pattern
 * @param err filled with the failure reason when NULL is returned
 * @return new regexp object or NULL on error
 */
rspamd_regexp_t*
rspamd_regexp_new (const gchar *pattern, const gchar *flags,
		GError **err)
{
	const gchar *start = pattern, *end, *flags_str = NULL;
	gchar *err_str;
	rspamd_regexp_t *res;
	PCRE_T *r;
	gchar sep = 0, *real_pattern;
#ifndef WITH_PCRE2
	gint err_off;
#else
	gsize err_off;
#endif
	gint regexp_flags = 0, rspamd_flags = 0, err_code, ncaptures;
	gboolean strict_flags = FALSE;

	rspamd_regexp_library_init (NULL);

	if (pattern == NULL) {
		/* Report the misuse instead of dereferencing a NULL pointer below */
		g_set_error (err, rspamd_regexp_quark(), EINVAL,
				"cannot create regexp from a NULL pattern");
		return NULL;
	}

	if (flags == NULL) {
		/* We need to parse pattern and detect flags set */
		if (*start == '/') {
			sep = '/';
		}
		else if (*start == 'm') {
			start ++;
			sep = *start;

			/* Paired braces */
			if (sep == '{') {
				sep = '}';
			}

			rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH;
		}
		if (sep == '\0' || g_ascii_isalnum (sep)) {
			/* We have no flags, no separators and just use all line as expr */
			start = pattern;
			end = start + strlen (pattern);
			rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH;
		}
		else {
			/* The last separator closes the expression, flags follow it */
			end = strrchr (pattern, sep);

			if (end == NULL || end <= start) {
				g_set_error (err, rspamd_regexp_quark(), EINVAL,
						"pattern is not enclosed with %c: %s",
						sep, pattern);
				return NULL;
			}
			flags_str = end + 1;
			/* Skip the opening separator */
			start ++;
		}
	}
	else {
		/* Strictly check all flags */
		strict_flags = TRUE;
		start = pattern;
		end = pattern + strlen (pattern);
		flags_str = flags;
	}

	/* Regexps are raw (non UTF) unless the `u` flag is given */
	rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;

#ifndef WITH_PCRE2
	regexp_flags &= ~PCRE_FLAG(UTF8);
	regexp_flags |= PCRE_FLAG(NEWLINE_ANYCRLF);
#else
	regexp_flags &= ~PCRE_FLAG(UTF);
#endif

	if (flags_str != NULL) {
		while (*flags_str) {
			switch (*flags_str) {
			case 'i':
				regexp_flags |= PCRE_FLAG(CASELESS);
				break;
			case 'm':
				regexp_flags |= PCRE_FLAG(MULTILINE);
				break;
			case 's':
				regexp_flags |= PCRE_FLAG(DOTALL);
				break;
			case 'x':
				regexp_flags |= PCRE_FLAG(EXTENDED);
				break;
			case 'u':
				rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW;
#ifndef WITH_PCRE2
				regexp_flags |= PCRE_FLAG(UTF8);
#else
				regexp_flags |= PCRE_FLAG(UTF);
#endif
				break;
			case 'O':
				/* We optimize all regexps by default */
				rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT;
				break;
			case 'r':
				rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW;
#ifndef WITH_PCRE2
				regexp_flags &= ~PCRE_FLAG(UTF8);
#else
				regexp_flags &= ~PCRE_FLAG(UTF);
#endif
				break;
			default:
				if (strict_flags) {
					g_set_error (err, rspamd_regexp_quark(), EINVAL,
							"invalid regexp flag: %c in pattern %s",
							*flags_str, pattern);
					return NULL;
				}
				/* Non strict mode: warn and ignore the rest of the flags */
				msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern);
				goto fin;
			}
			flags_str++;
		}
	}

fin:
	/* Copy the expression itself, without separators and flags */
	real_pattern = g_malloc (end - start + 1);
	rspamd_strlcpy (real_pattern, start, end - start + 1);

#ifndef WITH_PCRE2
	r = pcre_compile (real_pattern, regexp_flags,
			(const char **)&err_str, &err_off, NULL);
	(void)err_code;
#else
	r = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
			regexp_flags,
			&err_code, &err_off, pcre2_ctx);

	if (r == NULL) {
		err_str = g_alloca (1024);
		memset (err_str, 0, 1024);
		pcre2_get_error_message (err_code, err_str, 1024);
	}
#endif

	if (r == NULL) {
		g_set_error (err, rspamd_regexp_quark(), EINVAL,
				"regexp parsing error: '%s' at position %d",
				err_str, (gint)err_off);
		g_free (real_pattern);

		return NULL;
	}

	/* Now allocate the target structure */
	res = g_malloc0 (sizeof (*res));
	REF_INIT_RETAIN (res, rspamd_regexp_dtor);
	res->flags = rspamd_flags;
	res->pattern = real_pattern;
	res->cache_id = RSPAMD_INVALID_ID;
	res->pcre_flags = regexp_flags;
	res->max_hits = 0;
	res->re = r;

	if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) {
		/* A raw regexp shares the same compiled expression */
		res->raw_re = r;
	}
	else {
		/* Compile a second, non UTF variant of the same expression */
#ifndef WITH_PCRE2
		res->raw_re = pcre_compile (real_pattern, regexp_flags & ~PCRE_FLAG(UTF8),
				(const char **)&err_str, &err_off, NULL);
		(void)err_code;
#else
		res->raw_re = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED,
				regexp_flags & ~PCRE_FLAG(UTF),
				&err_code, &err_off, pcre2_ctx);

		if (res->raw_re == NULL) {
			err_str = g_alloca (1024);
			memset (err_str, 0, 1024);
			pcre2_get_error_message (err_code, err_str, 1024);
		}
#endif
		if (res->raw_re == NULL) {
			/* Not fatal: the object is still returned without a raw variant */
			msg_warn ("raw regexp parsing error: '%s': '%s' at position %d",
					err_str, real_pattern, (gint)err_off);
		}
	}

	rspamd_regexp_post_process (res);
	rspamd_regexp_generate_id (pattern, flags, res->id);

#ifndef WITH_PCRE2
	/* Check number of captures */
	if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT,
			&ncaptures) == 0) {
		res->ncaptures = ncaptures;
	}

	/* Check number of backrefs */
	if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_BACKREFMAX,
			&ncaptures) == 0) {
		res->nbackref = ncaptures;
	}
#else
	/* Check number of captures */
	if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_CAPTURECOUNT,
			&ncaptures) == 0) {
		res->ncaptures = ncaptures;
	}

	/* Check number of backrefs */
	if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_BACKREFMAX,
			&ncaptures) == 0) {
		res->nbackref = ncaptures;
	}
#endif

	return res;
}