rspamd_regexp_t* rspamd_regexp_cache_create (struct rspamd_regexp_cache *cache, const gchar *pattern, const gchar *flags, GError **err) { rspamd_regexp_t *res; if (cache == NULL) { rspamd_regexp_library_init (); cache = global_re_cache; } g_assert (cache != NULL); res = rspamd_regexp_cache_query (cache, pattern, flags); if (res != NULL) { return res; } res = rspamd_regexp_new (pattern, flags, err); if (res) { REF_RETAIN (res); g_hash_table_insert (cache->tbl, res->id, res); } return res; }
void rspamd_regexp_cache_insert (struct rspamd_regexp_cache* cache, const gchar *pattern, const gchar *flags, rspamd_regexp_t *re) { g_assert (re != NULL); g_assert (pattern != NULL); if (cache == NULL) { rspamd_regexp_library_init (); cache = global_re_cache; } g_assert (cache != NULL); /* Generate custom id */ rspamd_regexp_generate_id (pattern, flags, re->id); REF_RETAIN (re); g_hash_table_insert (cache->tbl, re->id, re); }
rspamd_regexp_t* rspamd_regexp_cache_query (struct rspamd_regexp_cache* cache, const gchar *pattern, const gchar *flags) { rspamd_regexp_t *res = NULL; regexp_id_t id; if (cache == NULL) { rspamd_regexp_library_init (); cache = global_re_cache; } g_assert (cache != NULL); rspamd_regexp_generate_id (pattern, flags, id); res = g_hash_table_lookup (cache->tbl, id); return res; }
rspamd_regexp_t* rspamd_regexp_new (const gchar *pattern, const gchar *flags, GError **err) { const gchar *start = pattern, *end, *flags_str = NULL, *err_str; rspamd_regexp_t *res; pcre *r; gchar sep = 0, *real_pattern; gint regexp_flags = 0, rspamd_flags = 0, err_off, study_flags = 0, ncaptures; gboolean strict_flags = FALSE; rspamd_regexp_library_init (); if (flags == NULL) { /* We need to parse pattern and detect flags set */ if (*start == '/') { sep = '/'; } else if (*start == 'm') { start ++; sep = *start; /* Paired braces */ if (sep == '{') { sep = '}'; } rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH; } if (sep == '\0' || g_ascii_isalnum (sep)) { /* We have no flags, no separators and just use all line as expr */ start = pattern; end = start + strlen (pattern); rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH; } else { end = strrchr (pattern, sep); if (end == NULL || end <= start) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "pattern is not enclosed with %c: %s", sep, pattern); return NULL; } flags_str = end + 1; start ++; } } else { /* Strictly check all flags */ strict_flags = TRUE; start = pattern; end = pattern + strlen (pattern); flags_str = flags; } rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; regexp_flags &= ~PCRE_UTF8; if (flags_str != NULL) { while (*flags_str) { switch (*flags_str) { case 'i': regexp_flags |= PCRE_CASELESS; break; case 'm': regexp_flags |= PCRE_MULTILINE; break; case 's': regexp_flags |= PCRE_DOTALL; break; case 'x': regexp_flags |= PCRE_EXTENDED; break; case 'u': rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW; regexp_flags |= PCRE_UTF8; break; case 'O': /* We optimize all regexps by default */ rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT; break; case 'r': rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; regexp_flags &= ~PCRE_UTF8; break; default: if (strict_flags) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp flag: %c in pattern %s", *flags_str, pattern); return NULL; } msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern); goto fin; break; } flags_str++; } } fin: real_pattern = g_malloc (end - start + 1); rspamd_strlcpy (real_pattern, start, end - start + 1); r = pcre_compile (real_pattern, regexp_flags, &err_str, &err_off, NULL); if (r == NULL) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp pattern: '%s': %s at position %d", pattern, err_str, err_off); g_free (real_pattern); return NULL; } /* Now allocate the target structure */ res = g_slice_alloc0 (sizeof (*res)); REF_INIT_RETAIN (res, rspamd_regexp_dtor); res->flags = rspamd_flags; res->pattern = real_pattern; if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) { res->raw_re = r; } else { res->re = r; res->raw_re = pcre_compile (pattern, regexp_flags & ~PCRE_UTF8, &err_str, &err_off, NULL); if (res->raw_re == NULL) { msg_warn ("invalid raw regexp pattern: '%s': %s at position %d", pattern, err_str, err_off); } } #ifdef HAVE_PCRE_JIT study_flags |= PCRE_STUDY_JIT_COMPILE; #endif if (!(rspamd_flags & RSPAMD_REGEXP_FLAG_NOOPT)) { /* Optimize regexp */ if (res->re) { res->extra = pcre_study (res->re, study_flags, &err_str); if (res->extra != NULL) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { jit = 0; n = pcre_fullinfo (res->re, res->extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", pattern); res->jstack = NULL; } else { res->jstack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); pcre_assign_jit_stack (res->extra, NULL, res->jstack); } } #endif } else { msg_warn ("cannot optimize regexp pattern: '%s': %s", pattern, err_str); } } if (res->raw_re) { if (res->raw_re != res->re) { res->raw_extra = pcre_study (res->raw_re, study_flags, &err_str); if (res->raw_extra != NULL) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { jit = 0; n = pcre_fullinfo (res->raw_re, res->raw_extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", pattern); res->raw_jstack = NULL; } else { res->raw_jstack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); pcre_assign_jit_stack (res->raw_extra, NULL, res->raw_jstack); } } #endif } else { msg_warn ("cannot optimize raw regexp pattern: '%s': %s", pattern, err_str); } } else { #ifdef HAVE_PCRE_JIT /* Just alias pointers */ res->raw_extra = res->extra; res->raw_jstack = res->jstack; #endif } } } rspamd_regexp_generate_id (pattern, flags, res->id); /* Check number of captures */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } return res; }
/* * Perform post load actions */ void rspamd_config_post_load (struct rspamd_config *cfg) { #ifdef HAVE_CLOCK_GETTIME struct timespec ts; #endif struct metric *def_metric; #ifdef HAVE_CLOCK_GETTIME #ifdef HAVE_CLOCK_PROCESS_CPUTIME_ID clock_getres (CLOCK_PROCESS_CPUTIME_ID, &ts); # elif defined(HAVE_CLOCK_VIRTUAL) clock_getres (CLOCK_VIRTUAL, &ts); # else clock_getres (CLOCK_REALTIME, &ts); # endif cfg->clock_res = log10 (1000000. / ts.tv_nsec); if (cfg->clock_res < 0) { cfg->clock_res = 0; } if (cfg->clock_res > 3) { cfg->clock_res = 3; } #else /* For gettimeofday */ cfg->clock_res = 1; #endif rspamd_regexp_library_init (); if ((def_metric = g_hash_table_lookup (cfg->metrics, DEFAULT_METRIC)) == NULL) { def_metric = rspamd_config_new_metric (cfg, NULL, DEFAULT_METRIC); def_metric->actions[METRIC_ACTION_REJECT].score = DEFAULT_SCORE; } if (cfg->tld_file == NULL) { /* Try to guess tld file */ GString *fpath = g_string_new (NULL); rspamd_printf_gstring (fpath, "%s%c%s", RSPAMD_PLUGINSDIR, G_DIR_SEPARATOR, "effective_tld_names.dat"); if (access (fpath->str, R_OK)) { msg_warn_config ("url_tld option is not specified but %s is available," " therefore this file is assumed as TLD file for URL" " extraction", fpath->str); cfg->tld_file = rspamd_mempool_strdup (cfg->cfg_pool, fpath->str); } else { msg_err_config ("no url_tld option has been specified, URL's detection " "will be awfully broken"); } g_string_free (fpath, TRUE); } /* Lua options */ (void)rspamd_lua_post_load_config (cfg); init_dynamic_config (cfg); rspamd_url_init (cfg->tld_file); /* Insert classifiers symbols */ (void)rspamd_config_insert_classify_symbols (cfg); }
rspamd_regexp_t* rspamd_regexp_new (const gchar *pattern, const gchar *flags, GError **err) { const gchar *start = pattern, *end, *flags_str = NULL; gchar *err_str; rspamd_regexp_t *res; PCRE_T *r; gchar sep = 0, *real_pattern; #ifndef WITH_PCRE2 gint err_off; #else gsize err_off; #endif gint regexp_flags = 0, rspamd_flags = 0, err_code, ncaptures; gboolean strict_flags = FALSE; rspamd_regexp_library_init (NULL); if (flags == NULL) { /* We need to parse pattern and detect flags set */ if (*start == '/') { sep = '/'; } else if (*start == 'm') { start ++; sep = *start; /* Paired braces */ if (sep == '{') { sep = '}'; } rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH; } if (sep == '\0' || g_ascii_isalnum (sep)) { /* We have no flags, no separators and just use all line as expr */ start = pattern; end = start + strlen (pattern); rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH; } else { end = strrchr (pattern, sep); if (end == NULL || end <= start) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "pattern is not enclosed with %c: %s", sep, pattern); return NULL; } flags_str = end + 1; start ++; } } else { /* Strictly check all flags */ strict_flags = TRUE; start = pattern; end = pattern + strlen (pattern); flags_str = flags; } rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags &= ~PCRE_FLAG(UTF8); regexp_flags |= PCRE_FLAG(NEWLINE_ANYCRLF); #else regexp_flags &= ~PCRE_FLAG(UTF); #endif if (flags_str != NULL) { while (*flags_str) { switch (*flags_str) { case 'i': regexp_flags |= PCRE_FLAG(CASELESS); break; case 'm': regexp_flags |= PCRE_FLAG(MULTILINE); break; case 's': regexp_flags |= PCRE_FLAG(DOTALL); break; case 'x': regexp_flags |= PCRE_FLAG(EXTENDED); break; case 'u': rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags |= PCRE_FLAG(UTF8); #else regexp_flags |= PCRE_FLAG(UTF); #endif break; case 'O': /* We optimize all regexps by default */ rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT; break; case 'r': rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags &= ~PCRE_FLAG(UTF8); #else regexp_flags &= ~PCRE_FLAG(UTF); #endif break; default: if (strict_flags) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp flag: %c in pattern %s", *flags_str, pattern); return NULL; } msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern); goto fin; break; } flags_str++; } } fin: real_pattern = g_malloc (end - start + 1); rspamd_strlcpy (real_pattern, start, end - start + 1); #ifndef WITH_PCRE2 r = pcre_compile (real_pattern, regexp_flags, (const char **)&err_str, &err_off, NULL); (void)err_code; #else r = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED, regexp_flags, &err_code, &err_off, pcre2_ctx); if (r == NULL) { err_str = g_alloca (1024); memset (err_str, 0, 1024); pcre2_get_error_message (err_code, err_str, 1024); } #endif if (r == NULL) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "regexp parsing error: '%s' at position %d", err_str, (gint)err_off); g_free (real_pattern); return NULL; } /* Now allocate the target structure */ res = g_malloc0 (sizeof (*res)); REF_INIT_RETAIN (res, rspamd_regexp_dtor); res->flags = rspamd_flags; res->pattern = real_pattern; res->cache_id = RSPAMD_INVALID_ID; res->pcre_flags = regexp_flags; res->max_hits = 0; res->re = r; if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) { res->raw_re = r; } else { #ifndef WITH_PCRE2 res->raw_re = pcre_compile (real_pattern, regexp_flags & ~PCRE_FLAG(UTF8), (const char **)&err_str, &err_off, NULL); (void)err_code; #else res->raw_re = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED, regexp_flags & ~PCRE_FLAG(UTF), &err_code, &err_off, pcre2_ctx); if (res->raw_re == NULL) { err_str = g_alloca (1024); memset (err_str, 0, 1024); pcre2_get_error_message (err_code, err_str, 1024); } #endif if (res->raw_re == NULL) { msg_warn ("raw regexp parsing error: '%s': '%s' at position %d", err_str, real_pattern, (gint)err_off); } } rspamd_regexp_post_process (res); rspamd_regexp_generate_id (pattern, flags, res->id); #ifndef WITH_PCRE2 /* Check number of captures */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } /* Check number of backrefs */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_BACKREFMAX, &ncaptures) == 0) { res->nbackref = ncaptures; } #else /* Check number of captures */ if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } /* Check number of backrefs */ if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_BACKREFMAX, &ncaptures) == 0) { res->nbackref = ncaptures; } #endif return res; }
static void rspamd_regexp_post_process (rspamd_regexp_t *r) { if (global_re_cache == NULL) { rspamd_regexp_library_init (NULL); } #if defined(WITH_PCRE2) gsize jsz; guint jit_flags = PCRE2_JIT_COMPLETE; /* Create match context */ r->mcontext = pcre2_match_context_create (NULL); if (r->re != r->raw_re) { r->raw_mcontext = pcre2_match_context_create (NULL); } else { r->raw_mcontext = r->mcontext; } #ifdef HAVE_PCRE_JIT if (pcre2_jit_compile (r->re, jit_flags) < 0) { msg_err ("jit compilation of %s is not supported: %d", r->pattern, jit_flags); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } else { if (!(pcre2_pattern_info (r->re, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0)) { msg_err ("jit compilation of %s is not supported", r->pattern); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } } if (!(r->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT)) { pcre2_jit_stack_assign (r->mcontext, NULL, global_re_cache->jstack); } if (r->re != r->raw_re) { if (pcre2_jit_compile (r->raw_re, jit_flags) < 0) { msg_debug ("jit compilation of %s is not supported", r->pattern); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } if (!(pcre2_pattern_info (r->raw_re, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0)) { msg_debug ("jit compilation of raw %s is not supported", r->pattern); } else if (!(r->flags & RSPAMD_REGEXP_FLAG_DISABLE_JIT)) { pcre2_jit_stack_assign (r->raw_mcontext, NULL, global_re_cache->jstack); } } #endif #else const gchar *err_str = "unknown"; gboolean try_jit = TRUE, try_raw_jit = TRUE; gint study_flags = 0; #if defined(HAVE_PCRE_JIT) study_flags |= PCRE_STUDY_JIT_COMPILE; #endif /* Pcre 1 needs study */ if (r->re) { r->extra = pcre_study (r->re, study_flags, &err_str); if (r->extra == NULL) { msg_debug ("cannot optimize regexp pattern: '%s': %s", r->pattern, err_str); try_jit = FALSE; r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } } else { g_assert_not_reached (); } if (r->raw_re && r->raw_re != r->re) { r->raw_extra = pcre_study (r->re, study_flags, &err_str); } else if (r->raw_re == r->re) { r->raw_extra = r->extra; } if (r->raw_extra == NULL) { msg_debug ("cannot optimize raw regexp pattern: '%s': %s", r->pattern, err_str); try_raw_jit = FALSE; } /* JIT path */ if (try_jit) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { jit = 0; n = pcre_fullinfo (r->re, r->extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", r->pattern); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } else { pcre_assign_jit_stack (r->extra, NULL, global_re_cache->jstack); } } #endif } else { msg_debug ("cannot optimize regexp pattern: '%s': %s", r->pattern, err_str); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } if (try_raw_jit) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { if (r->raw_re != r->re) { jit = 0; n = pcre_fullinfo (r->raw_re, r->raw_extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", r->pattern); r->flags |= RSPAMD_REGEXP_FLAG_DISABLE_JIT; } else { pcre_assign_jit_stack (r->raw_extra, NULL, global_re_cache->jstack); } } } #endif } #endif /* WITH_PCRE2 */ }