gint rspamd_re_cache_process (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, enum rspamd_re_type type, gpointer type_data, gsize datalen, gboolean is_strong) { guint64 re_id; struct rspamd_re_class *re_class; struct rspamd_re_cache *cache; g_assert (rt != NULL); g_assert (task != NULL); g_assert (re != NULL); cache = rt->cache; re_id = rspamd_regexp_get_cache_id (re); if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) { msg_err_task ("re '%s' has no valid id for the cache", rspamd_regexp_get_pattern (re)); return 0; } if (isset (rt->checked, re_id)) { /* Fast path */ rt->stat.regexp_fast_cached ++; return rt->results[re_id]; } else { /* Slow path */ re_class = rspamd_regexp_get_class (re); if (re_class == NULL) { msg_err_task ("cannot find re class for regexp '%s'", rspamd_regexp_get_pattern (re)); return 0; } return rspamd_re_cache_exec_re (task, rt, re, re_class, is_strong); } return 0; }
static guint rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt, rspamd_regexp_t *re, rspamd_mempool_t *pool, const guchar *in, gsize len, gboolean is_raw) { guint r = 0; const gchar *start = NULL, *end = NULL; guint max_hits = rspamd_regexp_get_maxhits (re); guint64 id = rspamd_regexp_get_cache_id (re); gdouble t1, t2; const gdouble slow_time = 0.1; if (len == 0) { len = strlen (in); } if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) { len = rt->cache->max_re_data; } r = rt->results[id]; t1 = rspamd_get_ticks (); if (max_hits == 0 || r < max_hits) { while (rspamd_regexp_search (re, in, len, &start, &end, is_raw, NULL)) { r++; if (max_hits > 0 && r >= max_hits) { break; } } rt->stat.regexp_checked++; rt->stat.bytes_scanned_pcre += len; rt->stat.bytes_scanned += len; if (r > 0) { rt->stat.regexp_matched += r; } } t2 = rspamd_get_ticks (); if (t2 - t1 > slow_time) { msg_info_pool ("regexp '%16s' took %.2f seconds to execute", rspamd_regexp_get_pattern (re), t2 - t1); } return r; }
/* * Calculates the specified regexp for the specified class if it's not calculated */ static guint rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, gboolean is_strong) { guint ret = 0, i, re_id; GPtrArray *headerlist; GHashTableIter it; struct raw_header *rh; const gchar *in, *end; const guchar **scvec; guint *lenvec; gboolean raw = FALSE; struct mime_text_part *part; struct rspamd_url *url; struct rspamd_re_cache *cache = rt->cache; gpointer k, v; guint len, cnt; msg_debug_re_cache ("get to the slow path for re type: %s: %s", rspamd_re_cache_type_to_string (re_class->type), rspamd_regexp_get_pattern (re)); re_id = rspamd_regexp_get_cache_id (re); switch (re_class->type) { case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ headerlist = rspamd_message_get_header_array (task, re_class->type_data, is_strong); if (headerlist) { scvec = g_malloc (sizeof (*scvec) * headerlist->len); lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); for (i = 0; i < headerlist->len; i ++) { rh = g_ptr_array_index (headerlist, i); if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; lenvec[i] = strlen (rh->value); } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, &end)) { lenvec[i] = 0; scvec[i] = (guchar *)""; continue; } lenvec[i] = end - in; } scvec[i] = (guchar *)in; } ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, headerlist->len, raw); debug_task ("checking header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, (const guchar **)&in, &len, 1, raw); debug_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate through text parts */ if (task->text_parts->len > 0) { scvec = g_malloc (sizeof (*scvec) * task->text_parts->len); lenvec = g_malloc (sizeof (*lenvec) * task->text_parts->len); for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); /* Skip empty parts */ if (IS_PART_EMPTY (part)) { lenvec[i] = 0; scvec[i] = (guchar *) ""; continue; } /* Check raw flags */ if (!IS_PART_UTF (part)) { raw = TRUE; } /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { in = part->orig->data; len = part->orig->len; raw = TRUE; } else { in = part->content->data; len = part->content->len; } scvec[i] = (guchar *) in; lenvec[i] = len; } ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, task->text_parts->len, raw); debug_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_URL: cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); g_hash_table_iter_init (&it, task->urls); i = 0; while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *)in; lenvec[i++] = len; } g_hash_table_iter_init (&it, task->emails); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *) in; lenvec[i++] = len; } g_assert (i == cnt); ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, i, raw); debug_task ("checking url regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_BODY: raw = TRUE; in = task->msg.begin; len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, (const guchar **)&in, &len, 1, raw); debug_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MAX: msg_err_task ("regexp of class invalid has been called: %s", rspamd_regexp_get_pattern (re)); break; } #if WITH_HYPERSCAN if (!rt->cache->disable_hyperscan) { rspamd_re_cache_finish_class (rt, re_class); } #endif setbit (rt->checked, re_id); return rt->results[re_id]; }
gint rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache, const char *cache_dir, gdouble max_time, gboolean silent, GError **err) { g_assert (cache != NULL); g_assert (cache_dir != NULL); #ifndef WITH_HYPERSCAN g_set_error (err, rspamd_re_cache_quark (), EINVAL, "hyperscan is disabled"); return -1; #else GHashTableIter it, cit; gpointer k, v; struct rspamd_re_class *re_class; gchar path[PATH_MAX]; hs_database_t *test_db; gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags; guint64 crc; rspamd_regexp_t *re; hs_compile_error_t *hs_errors; guint *hs_flags = NULL; const gchar **hs_pats = NULL; gchar *hs_serialized; gsize serialized_len, total = 0; struct iovec iov[7]; g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir, G_DIR_SEPARATOR, re_class->hash); if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) { fd = open (path, O_RDONLY, 00600); /* Read number of regexps */ g_assert (fd != -1); lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET); read (fd, &n, sizeof (n)); close (fd); if (re_class->type_len > 0) { if (!silent) { msg_info_re_cache ( "skip already valid class %s(%*s) to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), (gint) re_class->type_len - 1, re_class->type_data, re_class->hash, n); } } else { if (!silent) { msg_info_re_cache ( "skip already valid class %s to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), re_class->hash, n); } } continue; } fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600); if (fd == -1) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot open file " "%s: %s", path, strerror (errno)); return -1; } g_hash_table_iter_init (&cit, re_class->re); n = g_hash_table_size (re_class->re); hs_flags = g_malloc0 (sizeof (*hs_flags) * n); hs_ids = g_malloc (sizeof (*hs_ids) * n); hs_pats = g_malloc (sizeof (*hs_pats) * n); i = 0; while (g_hash_table_iter_next (&cit, &k, &v)) { re = v; pcre_flags = rspamd_regexp_get_pcre_flags (re); re_flags = rspamd_regexp_get_flags (re); if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) { /* Do not try to compile bad regexp */ msg_info_re_cache ( "do not try compile %s to hyperscan as it is PCRE only", rspamd_regexp_get_pattern (re)); continue; } hs_flags[i] = 0; #ifndef WITH_PCRE2 if (pcre_flags & PCRE_FLAG(UTF8)) { hs_flags[i] |= HS_FLAG_UTF8; } #else if (pcre_flags & PCRE_FLAG(UTF)) { hs_flags[i] |= HS_FLAG_UTF8; } #endif if (pcre_flags & PCRE_FLAG(CASELESS)) { hs_flags[i] |= HS_FLAG_CASELESS; } if (pcre_flags & PCRE_FLAG(MULTILINE)) { hs_flags[i] |= HS_FLAG_MULTILINE; } if (pcre_flags & PCRE_FLAG(DOTALL)) { hs_flags[i] |= HS_FLAG_DOTALL; } if (rspamd_regexp_get_maxhits (re) == 1) { hs_flags[i] |= HS_FLAG_SINGLEMATCH; } if (hs_compile (rspamd_regexp_get_pattern (re), hs_flags[i], cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { msg_info_re_cache ("cannot compile %s to hyperscan, try prefilter match", rspamd_regexp_get_pattern (re)); hs_free_compile_error (hs_errors); /* The approximation operation might take a significant * amount of time, so we need to check if it's finite */ if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], max_time)) { hs_flags[i] |= HS_FLAG_PREFILTER; hs_ids[i] = rspamd_regexp_get_cache_id (re); hs_pats[i] = rspamd_regexp_get_pattern (re); i++; } } else { hs_ids[i] = rspamd_regexp_get_cache_id (re); hs_pats[i] = rspamd_regexp_get_pattern (re); i ++; hs_free_database (test_db); } } /* Adjust real re number */ n = i; if (n > 0) { /* Create the hs tree */ if (hs_compile_multi (hs_pats, hs_flags, hs_ids, n, cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { g_set_error (err, rspamd_re_cache_quark (), EINVAL, "cannot create tree of regexp when processing '%s': %s", hs_pats[hs_errors->expression], hs_errors->message); g_free (hs_flags); g_free (hs_ids); g_free (hs_pats); close (fd); hs_free_compile_error (hs_errors); return -1; } g_free (hs_pats); if (hs_serialize_database (test_db, &hs_serialized, &serialized_len) != HS_SUCCESS) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot serialize tree of regexp for %s", re_class->hash); close (fd); g_free (hs_ids); g_free (hs_flags); hs_free_database (test_db); return -1; } hs_free_database (test_db); /* * Magic - 8 bytes * Platform - sizeof (platform) * n - number of regexps * n * <regexp ids> * n * <regexp flags> * crc - 8 bytes checksum * <hyperscan blob> */ crc = XXH64 (hs_serialized, serialized_len, 0xdeadbabe); if (cache->vectorized_hyperscan) { iov[0].iov_base = (void *) rspamd_hs_magic_vector; } else { iov[0].iov_base = (void *) rspamd_hs_magic; } iov[0].iov_len = RSPAMD_HS_MAGIC_LEN; iov[1].iov_base = &cache->plt; iov[1].iov_len = sizeof (cache->plt); iov[2].iov_base = &n; iov[2].iov_len = sizeof (n); iov[3].iov_base = hs_ids; iov[3].iov_len = sizeof (*hs_ids) * n; iov[4].iov_base = hs_flags; iov[4].iov_len = sizeof (*hs_flags) * n; iov[5].iov_base = &crc; iov[5].iov_len = sizeof (crc); iov[6].iov_base = hs_serialized; iov[6].iov_len = serialized_len; if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) { g_set_error (err, rspamd_re_cache_quark (), errno, "cannot serialize tree of regexp to %s: %s", path, strerror (errno)); close (fd); g_free (hs_ids); g_free (hs_flags); g_free (hs_serialized); return -1; } if (re_class->type_len > 0) { msg_info_re_cache ( "compiled class %s(%*s) to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), (gint) re_class->type_len - 1, re_class->type_data, re_class->hash, n); } else { msg_info_re_cache ( "compiled class %s to cache %6s, %d regexps", rspamd_re_cache_type_to_string (re_class->type), re_class->hash, n); } total += n; g_free (hs_serialized); g_free (hs_ids); g_free (hs_flags); } close (fd); } return total; #endif }
static gboolean rspamd_re_cache_is_finite (struct rspamd_re_cache *cache, rspamd_regexp_t *re, gint flags, gdouble max_time) { pid_t cld; gint status; struct timespec ts; hs_compile_error_t *hs_errors; hs_database_t *test_db; gdouble wait_time; const gint max_tries = 10; gint tries = 0, rc; wait_time = max_time / max_tries; /* We need to restore SIGCHLD processing */ signal (SIGCHLD, SIG_DFL); cld = fork (); g_assert (cld != -1); if (cld == 0) { /* Try to compile pattern */ if (hs_compile (rspamd_regexp_get_pattern (re), flags | HS_FLAG_PREFILTER, cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK, &cache->plt, &test_db, &hs_errors) != HS_SUCCESS) { exit (EXIT_FAILURE); } exit (EXIT_SUCCESS); } else { double_to_ts (wait_time, &ts); while ((rc = waitpid (cld, &status, WNOHANG)) == 0 && tries ++ < max_tries) { (void)nanosleep (&ts, NULL); } /* Child has been terminated */ if (rc > 0) { /* Forget about SIGCHLD after this point */ signal (SIGCHLD, SIG_IGN); if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS) { return TRUE; } else { msg_err_re_cache ( "cannot approximate %s to hyperscan", rspamd_regexp_get_pattern (re)); return FALSE; } } else { /* We consider that as timeout */ kill (cld, SIGKILL); g_assert (waitpid (cld, &status, 0) != -1); msg_err_re_cache ( "cannot approximate %s to hyperscan: timeout waiting", rspamd_regexp_get_pattern (re)); signal (SIGCHLD, SIG_IGN); } } return FALSE; }
/* * Calculates the specified regexp for the specified class if it's not calculated */ static guint rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, gboolean is_strong) { guint ret = 0, i, re_id; GList *cur, *headerlist; GHashTableIter it; struct raw_header *rh; const gchar *in; gboolean raw = FALSE; struct mime_text_part *part; struct rspamd_url *url; struct rspamd_re_cache *cache = rt->cache; gpointer k, v; gsize len; msg_debug_re_cache ("get to the slow path for re type: %s: %s", rspamd_re_cache_type_to_string (re_class->type), rspamd_regexp_get_pattern (re)); re_id = rspamd_regexp_get_cache_id (re); switch (re_class->type) { case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ headerlist = rspamd_message_get_header (task, re_class->type_data, is_strong); if (headerlist) { cur = headerlist; while (cur) { rh = cur->data; if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, NULL)) { cur = g_list_next (cur); continue; } } /* Match re */ if (in) { ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, strlen (in), raw); debug_task ("checking header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); } cur = g_list_next (cur); } } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate throught text parts */ for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); /* Skip empty parts */ if (IS_PART_EMPTY (part)) { continue; } /* Check raw flags */ if (!IS_PART_UTF (part)) { raw = TRUE; } /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { in = part->orig->data; len = part->orig->len; raw = TRUE; } else { in = part->content->data; len = part->content->len; } if (len > 0) { ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); } } break; case RSPAMD_RE_URL: g_hash_table_iter_init (&it, task->urls); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); } g_hash_table_iter_init (&it, task->emails); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); } debug_task ("checking url regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_BODY: raw = TRUE; in = task->msg.begin; len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MAX: msg_err_task ("regexp of class invalid has been called: %s", rspamd_regexp_get_pattern (re)); break; } #if WITH_HYPERSCAN if (!rt->cache->disable_hyperscan) { rspamd_re_cache_finish_class (rt, re_class); } #endif setbit (rt->checked, re_id); return rt->results[re_id]; }