void rspamd_regexp_cache_insert (struct rspamd_regexp_cache* cache, const gchar *pattern, const gchar *flags, rspamd_regexp_t *re) { g_assert (re != NULL); g_assert (pattern != NULL); if (cache == NULL) { rspamd_regexp_library_init (); cache = global_re_cache; } g_assert (cache != NULL); /* Generate custom id */ rspamd_regexp_generate_id (pattern, flags, re->id); REF_RETAIN (re); g_hash_table_insert (cache->tbl, re->id, re); }
rspamd_regexp_t* rspamd_regexp_cache_query (struct rspamd_regexp_cache* cache, const gchar *pattern, const gchar *flags) { rspamd_regexp_t *res = NULL; regexp_id_t id; if (cache == NULL) { rspamd_regexp_library_init (); cache = global_re_cache; } g_assert (cache != NULL); rspamd_regexp_generate_id (pattern, flags, id); res = g_hash_table_lookup (cache->tbl, id); return res; }
rspamd_regexp_t* rspamd_regexp_new (const gchar *pattern, const gchar *flags, GError **err) { const gchar *start = pattern, *end, *flags_str = NULL, *err_str; rspamd_regexp_t *res; pcre *r; gchar sep = 0, *real_pattern; gint regexp_flags = 0, rspamd_flags = 0, err_off, study_flags = 0, ncaptures; gboolean strict_flags = FALSE; rspamd_regexp_library_init (); if (flags == NULL) { /* We need to parse pattern and detect flags set */ if (*start == '/') { sep = '/'; } else if (*start == 'm') { start ++; sep = *start; /* Paired braces */ if (sep == '{') { sep = '}'; } rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH; } if (sep == '\0' || g_ascii_isalnum (sep)) { /* We have no flags, no separators and just use all line as expr */ start = pattern; end = start + strlen (pattern); rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH; } else { end = strrchr (pattern, sep); if (end == NULL || end <= start) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "pattern is not enclosed with %c: %s", sep, pattern); return NULL; } flags_str = end + 1; start ++; } } else { /* Strictly check all flags */ strict_flags = TRUE; start = pattern; end = pattern + strlen (pattern); flags_str = flags; } rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; regexp_flags &= ~PCRE_UTF8; if (flags_str != NULL) { while (*flags_str) { switch (*flags_str) { case 'i': regexp_flags |= PCRE_CASELESS; break; case 'm': regexp_flags |= PCRE_MULTILINE; break; case 's': regexp_flags |= PCRE_DOTALL; break; case 'x': regexp_flags |= PCRE_EXTENDED; break; case 'u': rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW; regexp_flags |= PCRE_UTF8; break; case 'O': /* We optimize all regexps by default */ rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT; break; case 'r': rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; regexp_flags &= ~PCRE_UTF8; break; default: if (strict_flags) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp flag: %c in pattern %s", *flags_str, pattern); return NULL; } msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern); goto fin; break; } flags_str++; } } fin: real_pattern = g_malloc (end - start + 1); rspamd_strlcpy (real_pattern, start, end - start + 1); r = pcre_compile (real_pattern, regexp_flags, &err_str, &err_off, NULL); if (r == NULL) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp pattern: '%s': %s at position %d", pattern, err_str, err_off); g_free (real_pattern); return NULL; } /* Now allocate the target structure */ res = g_slice_alloc0 (sizeof (*res)); REF_INIT_RETAIN (res, rspamd_regexp_dtor); res->flags = rspamd_flags; res->pattern = real_pattern; if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) { res->raw_re = r; } else { res->re = r; res->raw_re = pcre_compile (pattern, regexp_flags & ~PCRE_UTF8, &err_str, &err_off, NULL); if (res->raw_re == NULL) { msg_warn ("invalid raw regexp pattern: '%s': %s at position %d", pattern, err_str, err_off); } } #ifdef HAVE_PCRE_JIT study_flags |= PCRE_STUDY_JIT_COMPILE; #endif if (!(rspamd_flags & RSPAMD_REGEXP_FLAG_NOOPT)) { /* Optimize regexp */ if (res->re) { res->extra = pcre_study (res->re, study_flags, &err_str); if (res->extra != NULL) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { jit = 0; n = pcre_fullinfo (res->re, res->extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", pattern); res->jstack = NULL; } else { res->jstack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); pcre_assign_jit_stack (res->extra, NULL, res->jstack); } } #endif } else { msg_warn ("cannot optimize regexp pattern: '%s': %s", pattern, err_str); } } if (res->raw_re) { if (res->raw_re != res->re) { res->raw_extra = pcre_study (res->raw_re, study_flags, &err_str); if (res->raw_extra != NULL) { #ifdef HAVE_PCRE_JIT gint jit, n; if (can_jit) { jit = 0; n = pcre_fullinfo (res->raw_re, res->raw_extra, PCRE_INFO_JIT, &jit); if (n != 0 || jit != 1) { msg_debug ("jit compilation of %s is not supported", pattern); res->raw_jstack = NULL; } else { res->raw_jstack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); pcre_assign_jit_stack (res->raw_extra, NULL, res->raw_jstack); } } #endif } else { msg_warn ("cannot optimize raw regexp pattern: '%s': %s", pattern, err_str); } } else { #ifdef HAVE_PCRE_JIT /* Just alias pointers */ res->raw_extra = res->extra; res->raw_jstack = res->jstack; #endif } } } rspamd_regexp_generate_id (pattern, flags, res->id); /* Check number of captures */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } return res; }
rspamd_regexp_t* rspamd_regexp_new (const gchar *pattern, const gchar *flags, GError **err) { const gchar *start = pattern, *end, *flags_str = NULL; gchar *err_str; rspamd_regexp_t *res; PCRE_T *r; gchar sep = 0, *real_pattern; #ifndef WITH_PCRE2 gint err_off; #else gsize err_off; #endif gint regexp_flags = 0, rspamd_flags = 0, err_code, ncaptures; gboolean strict_flags = FALSE; rspamd_regexp_library_init (NULL); if (flags == NULL) { /* We need to parse pattern and detect flags set */ if (*start == '/') { sep = '/'; } else if (*start == 'm') { start ++; sep = *start; /* Paired braces */ if (sep == '{') { sep = '}'; } rspamd_flags |= RSPAMD_REGEXP_FLAG_FULL_MATCH; } if (sep == '\0' || g_ascii_isalnum (sep)) { /* We have no flags, no separators and just use all line as expr */ start = pattern; end = start + strlen (pattern); rspamd_flags &= ~RSPAMD_REGEXP_FLAG_FULL_MATCH; } else { end = strrchr (pattern, sep); if (end == NULL || end <= start) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "pattern is not enclosed with %c: %s", sep, pattern); return NULL; } flags_str = end + 1; start ++; } } else { /* Strictly check all flags */ strict_flags = TRUE; start = pattern; end = pattern + strlen (pattern); flags_str = flags; } rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags &= ~PCRE_FLAG(UTF8); regexp_flags |= PCRE_FLAG(NEWLINE_ANYCRLF); #else regexp_flags &= ~PCRE_FLAG(UTF); #endif if (flags_str != NULL) { while (*flags_str) { switch (*flags_str) { case 'i': regexp_flags |= PCRE_FLAG(CASELESS); break; case 'm': regexp_flags |= PCRE_FLAG(MULTILINE); break; case 's': regexp_flags |= PCRE_FLAG(DOTALL); break; case 'x': regexp_flags |= PCRE_FLAG(EXTENDED); break; case 'u': rspamd_flags &= ~RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags |= PCRE_FLAG(UTF8); #else regexp_flags |= PCRE_FLAG(UTF); #endif break; case 'O': /* We optimize all regexps by default */ rspamd_flags |= RSPAMD_REGEXP_FLAG_NOOPT; break; case 'r': rspamd_flags |= RSPAMD_REGEXP_FLAG_RAW; #ifndef WITH_PCRE2 regexp_flags &= ~PCRE_FLAG(UTF8); #else regexp_flags &= ~PCRE_FLAG(UTF); #endif break; default: if (strict_flags) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "invalid regexp flag: %c in pattern %s", *flags_str, pattern); return NULL; } msg_warn ("invalid flag '%c' in pattern %s", *flags_str, pattern); goto fin; break; } flags_str++; } } fin: real_pattern = g_malloc (end - start + 1); rspamd_strlcpy (real_pattern, start, end - start + 1); #ifndef WITH_PCRE2 r = pcre_compile (real_pattern, regexp_flags, (const char **)&err_str, &err_off, NULL); (void)err_code; #else r = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED, regexp_flags, &err_code, &err_off, pcre2_ctx); if (r == NULL) { err_str = g_alloca (1024); memset (err_str, 0, 1024); pcre2_get_error_message (err_code, err_str, 1024); } #endif if (r == NULL) { g_set_error (err, rspamd_regexp_quark(), EINVAL, "regexp parsing error: '%s' at position %d", err_str, (gint)err_off); g_free (real_pattern); return NULL; } /* Now allocate the target structure */ res = g_malloc0 (sizeof (*res)); REF_INIT_RETAIN (res, rspamd_regexp_dtor); res->flags = rspamd_flags; res->pattern = real_pattern; res->cache_id = RSPAMD_INVALID_ID; res->pcre_flags = regexp_flags; res->max_hits = 0; res->re = r; if (rspamd_flags & RSPAMD_REGEXP_FLAG_RAW) { res->raw_re = r; } else { #ifndef WITH_PCRE2 res->raw_re = pcre_compile (real_pattern, regexp_flags & ~PCRE_FLAG(UTF8), (const char **)&err_str, &err_off, NULL); (void)err_code; #else res->raw_re = pcre2_compile (real_pattern, PCRE2_ZERO_TERMINATED, regexp_flags & ~PCRE_FLAG(UTF), &err_code, &err_off, pcre2_ctx); if (res->raw_re == NULL) { err_str = g_alloca (1024); memset (err_str, 0, 1024); pcre2_get_error_message (err_code, err_str, 1024); } #endif if (res->raw_re == NULL) { msg_warn ("raw regexp parsing error: '%s': '%s' at position %d", err_str, real_pattern, (gint)err_off); } } rspamd_regexp_post_process (res); rspamd_regexp_generate_id (pattern, flags, res->id); #ifndef WITH_PCRE2 /* Check number of captures */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } /* Check number of backrefs */ if (pcre_fullinfo (res->raw_re, res->extra, PCRE_INFO_BACKREFMAX, &ncaptures) == 0) { res->nbackref = ncaptures; } #else /* Check number of captures */ if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_CAPTURECOUNT, &ncaptures) == 0) { res->ncaptures = ncaptures; } /* Check number of backrefs */ if (pcre2_pattern_info (res->raw_re, PCRE2_INFO_BACKREFMAX, &ncaptures) == 0) { res->nbackref = ncaptures; } #endif return res; }