/*
 * Compile every regexp class of `cache` to a hyperscan database and store it
 * in `cache_dir` as "<class hash>.hs".
 *
 * A class is skipped when rspamd_re_cache_is_valid_hyperscan_file () accepts
 * the file that already exists for it. Regexps flagged as PCRE only are never
 * passed to hyperscan; regexps that hyperscan cannot compile natively are
 * retried with HS_FLAG_PREFILTER if rspamd_re_cache_is_finite () allows it.
 *
 * Layout of a written file:
 *   Magic - RSPAMD_HS_MAGIC_LEN bytes
 *   Platform - sizeof (cache->plt)
 *   n - number of regexps
 *   n * <regexp ids>
 *   n * <regexp flags>
 *   crc - 8 bytes checksum of the hyperscan blob
 *   <hyperscan blob>
 *
 * @param cache regexp cache (must not be NULL)
 * @param cache_dir directory for the serialized databases (must not be NULL)
 * @param max_time passed to rspamd_re_cache_is_finite () for prefilter checks
 * @param silent if TRUE, do not log classes that are skipped as already valid
 * @param err filled on failure
 * @return number of regexps compiled by this call, or -1 on error (always -1
 * when built without WITH_HYPERSCAN)
 */
gint
rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
		const char *cache_dir, gdouble max_time, gboolean silent,
		GError **err)
{
	g_assert (cache != NULL);
	g_assert (cache_dir != NULL);

#ifndef WITH_HYPERSCAN
	g_set_error (err, rspamd_re_cache_quark (), EINVAL, "hyperscan is disabled");
	return -1;
#else
	GHashTableIter it, cit;
	gpointer k, v;
	struct rspamd_re_class *re_class;
	gchar path[PATH_MAX];
	hs_database_t *test_db;
	gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
	guint64 crc;
	rspamd_regexp_t *re;
	hs_compile_error_t *hs_errors;
	guint *hs_flags = NULL;
	const gchar **hs_pats = NULL;
	const gchar *failed_pat;
	gchar *hs_serialized;
	gsize serialized_len, total = 0;
	struct iovec iov[7];

	g_hash_table_iter_init (&it, cache->re_classes);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		re_class = v;
		rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
				G_DIR_SEPARATOR, re_class->hash);

		if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {
			fd = open (path, O_RDONLY, 00600);

			/* Read number of regexps */
			g_assert (fd != -1);
			lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET);

			if (read (fd, &n, sizeof (n)) != (gssize) sizeof (n)) {
				/* The counter is used for logging only, never log garbage */
				n = 0;
			}

			close (fd);

			if (re_class->type_len > 0) {
				if (!silent) {
					msg_info_re_cache (
							"skip already valid class %s(%*s) to cache %6s, %d regexps",
							rspamd_re_cache_type_to_string (re_class->type),
							(gint) re_class->type_len - 1,
							re_class->type_data,
							re_class->hash,
							n);
				}
			}
			else {
				if (!silent) {
					msg_info_re_cache (
							"skip already valid class %s to cache %6s, %d regexps",
							rspamd_re_cache_type_to_string (re_class->type),
							re_class->hash,
							n);
				}
			}

			continue;
		}

		fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);

		if (fd == -1) {
			g_set_error (err, rspamd_re_cache_quark (), errno,
					"cannot open file %s: %s", path, strerror (errno));
			return -1;
		}

		g_hash_table_iter_init (&cit, re_class->re);
		n = g_hash_table_size (re_class->re);
		hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
		hs_ids = g_malloc (sizeof (*hs_ids) * n);
		hs_pats = g_malloc (sizeof (*hs_pats) * n);
		i = 0;

		while (g_hash_table_iter_next (&cit, &k, &v)) {
			re = v;
			pcre_flags = rspamd_regexp_get_pcre_flags (re);
			re_flags = rspamd_regexp_get_flags (re);

			if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
				/* Do not try to compile bad regexp */
				msg_info_re_cache (
						"do not try compile %s to hyperscan as it is PCRE only",
						rspamd_regexp_get_pattern (re));
				continue;
			}

			/* Slot i may have been dirtied by a previously rejected regexp */
			hs_flags[i] = 0;
#ifndef WITH_PCRE2
			if (pcre_flags & PCRE_FLAG(UTF8)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#else
			if (pcre_flags & PCRE_FLAG(UTF)) {
				hs_flags[i] |= HS_FLAG_UTF8;
			}
#endif
			if (pcre_flags & PCRE_FLAG(CASELESS)) {
				hs_flags[i] |= HS_FLAG_CASELESS;
			}
			if (pcre_flags & PCRE_FLAG(MULTILINE)) {
				hs_flags[i] |= HS_FLAG_MULTILINE;
			}
			if (pcre_flags & PCRE_FLAG(DOTALL)) {
				hs_flags[i] |= HS_FLAG_DOTALL;
			}
			if (rspamd_regexp_get_maxhits (re) == 1) {
				hs_flags[i] |= HS_FLAG_SINGLEMATCH;
			}

			if (hs_compile (rspamd_regexp_get_pattern (re),
					hs_flags[i],
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {
				msg_info_re_cache ("cannot compile %s to hyperscan, try prefilter match",
						rspamd_regexp_get_pattern (re));
				hs_free_compile_error (hs_errors);

				/* The approximation operation might take a significant
				 * amount of time, so we need to check if it's finite
				 */
				if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], max_time)) {
					hs_flags[i] |= HS_FLAG_PREFILTER;
					hs_ids[i] = rspamd_regexp_get_cache_id (re);
					hs_pats[i] = rspamd_regexp_get_pattern (re);
					i++;
				}
			}
			else {
				hs_ids[i] = rspamd_regexp_get_cache_id (re);
				hs_pats[i] = rspamd_regexp_get_pattern (re);
				i++;
				hs_free_database (test_db);
			}
		}

		/* Adjust real re number */
		n = i;

		if (n > 0) {
			/* Create the hs tree */
			if (hs_compile_multi (hs_pats,
					hs_flags,
					hs_ids,
					n,
					cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
					&cache->plt,
					&test_db,
					&hs_errors) != HS_SUCCESS) {
				/*
				 * The expression index is negative when the error is not
				 * specific to a single expression: never use it unchecked
				 */
				failed_pat = "unknown";

				if (hs_errors->expression >= 0 && hs_errors->expression < n) {
					failed_pat = hs_pats[hs_errors->expression];
				}

				g_set_error (err, rspamd_re_cache_quark (), EINVAL,
						"cannot create tree of regexp when processing '%s': %s",
						failed_pat, hs_errors->message);
				g_free (hs_flags);
				g_free (hs_ids);
				g_free (hs_pats);
				close (fd);
				/* We have created this file via O_EXCL, do not leave a stub */
				unlink (path);
				hs_free_compile_error (hs_errors);

				return -1;
			}

			if (hs_serialize_database (test_db, &hs_serialized,
					&serialized_len) != HS_SUCCESS) {
				g_set_error (err, rspamd_re_cache_quark (), errno,
						"cannot serialize tree of regexp for %s",
						re_class->hash);
				close (fd);
				unlink (path);
				g_free (hs_ids);
				g_free (hs_flags);
				g_free (hs_pats);
				hs_free_database (test_db);

				return -1;
			}

			hs_free_database (test_db);

			/* Checksum covers the hyperscan blob only */
			crc = XXH64 (hs_serialized, serialized_len, 0xdeadbabe);

			if (cache->vectorized_hyperscan) {
				iov[0].iov_base = (void *) rspamd_hs_magic_vector;
			}
			else {
				iov[0].iov_base = (void *) rspamd_hs_magic;
			}

			iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
			iov[1].iov_base = &cache->plt;
			iov[1].iov_len = sizeof (cache->plt);
			iov[2].iov_base = &n;
			iov[2].iov_len = sizeof (n);
			iov[3].iov_base = hs_ids;
			iov[3].iov_len = sizeof (*hs_ids) * n;
			iov[4].iov_base = hs_flags;
			iov[4].iov_len = sizeof (*hs_flags) * n;
			iov[5].iov_base = &crc;
			iov[5].iov_len = sizeof (crc);
			iov[6].iov_base = hs_serialized;
			iov[6].iov_len = serialized_len;

			if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) {
				/* errno is consumed before close/unlink can overwrite it */
				g_set_error (err, rspamd_re_cache_quark (), errno,
						"cannot serialize tree of regexp to %s: %s",
						path, strerror (errno));
				close (fd);
				unlink (path);
				g_free (hs_ids);
				g_free (hs_flags);
				g_free (hs_pats);
				g_free (hs_serialized);

				return -1;
			}

			if (re_class->type_len > 0) {
				msg_info_re_cache (
						"compiled class %s(%*s) to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						(gint) re_class->type_len - 1,
						re_class->type_data,
						re_class->hash,
						n);
			}
			else {
				msg_info_re_cache (
						"compiled class %s to cache %6s, %d regexps",
						rspamd_re_cache_type_to_string (re_class->type),
						re_class->hash,
						n);
			}

			total += n;
			g_free (hs_serialized);
		}

		/*
		 * The arrays are allocated for each class regardless of how many
		 * regexps are finally accepted, so release them even when n == 0
		 */
		g_free (hs_pats);
		g_free (hs_ids);
		g_free (hs_flags);
		hs_pats = NULL;
		hs_ids = NULL;
		hs_flags = NULL;

		close (fd);
	}

	return total;
#endif
}
/*
 * Calculates the specified regexp for the specified class if it's not calculated
 *
 * Slow path: the buffers selected by the class type (headers, all raw headers,
 * text parts, urls and emails, or the raw message) are collected into a vector
 * and passed to rspamd_re_cache_process_regexp_data () in a single call.
 *
 * @param task task being scanned
 * @param rt runtime that holds the results and the `checked` bitmap
 * @param re regexp to execute
 * @param re_class class of the regexp, selects the data to scan
 * @param is_strong passed to rspamd_message_get_header_array () for header
 * lookups
 * @return rt->results[] entry for this regexp after it is marked as checked
 */
static guint
rspamd_re_cache_exec_re (struct rspamd_task *task,
		struct rspamd_re_runtime *rt,
		rspamd_regexp_t *re,
		struct rspamd_re_class *re_class,
		gboolean is_strong)
{
	guint ret = 0, i, re_id;
	GPtrArray *headerlist;
	GHashTableIter it;
	struct raw_header *rh;
	const gchar *in, *end;
	const guchar **scvec;
	guint *lenvec;
	gboolean raw = FALSE;
	struct mime_text_part *part;
	struct rspamd_url *url;
	struct rspamd_re_cache *cache = rt->cache;
	gpointer k, v;
	guint len, cnt;

	msg_debug_re_cache ("get to the slow path for re type: %s: %s",
			rspamd_re_cache_type_to_string (re_class->type),
			rspamd_regexp_get_pattern (re));
	re_id = rspamd_regexp_get_cache_id (re);

	switch (re_class->type) {
	case RSPAMD_RE_HEADER:
	case RSPAMD_RE_RAWHEADER:
		/* Get list of specified headers */
		headerlist = rspamd_message_get_header_array (task,
				re_class->type_data,
				is_strong);

		if (headerlist) {
			scvec = g_malloc (sizeof (*scvec) * headerlist->len);
			lenvec = g_malloc (sizeof (*lenvec) * headerlist->len);

			for (i = 0; i < headerlist->len; i ++) {
				rh = g_ptr_array_index (headerlist, i);

				if (re_class->type == RSPAMD_RE_RAWHEADER) {
					in = rh->value;
					raw = TRUE;

					if (in == NULL) {
						/* strlen (NULL) is undefined, scan an empty buffer */
						lenvec[i] = 0;
						scvec[i] = (guchar *)"";
						continue;
					}

					lenvec[i] = strlen (in);
				}
				else {
					in = rh->decoded;

					/* Validate input */
					if (!in || !g_utf8_validate (in, -1, &end)) {
						lenvec[i] = 0;
						scvec[i] = (guchar *)"";
						continue;
					}

					/* `end` is set by g_utf8_validate () above */
					lenvec[i] = end - in;
				}

				scvec[i] = (guchar *)in;
			}

			ret = rspamd_re_cache_process_regexp_data (rt, re,
					task->task_pool, scvec, lenvec, headerlist->len, raw);
			debug_task ("checking header %s regexp: %s -> %d",
					re_class->type_data,
					rspamd_regexp_get_pattern (re), ret);
			g_free (scvec);
			g_free (lenvec);
		}
		break;
	case RSPAMD_RE_ALLHEADER:
		raw = TRUE;
		in = task->raw_headers_content.begin;
		len = task->raw_headers_content.len;
		ret = rspamd_re_cache_process_regexp_data (rt, re,
				task->task_pool, (const guchar **)&in, &len, 1, raw);
		debug_task ("checking allheader regexp: %s -> %d",
				rspamd_regexp_get_pattern (re), ret);
		break;
	case RSPAMD_RE_MIME:
	case RSPAMD_RE_RAWMIME:
		/* Iterate through text parts */
		if (task->text_parts->len > 0) {
			scvec = g_malloc (sizeof (*scvec) * task->text_parts->len);
			lenvec = g_malloc (sizeof (*lenvec) * task->text_parts->len);

			for (i = 0; i < task->text_parts->len; i++) {
				part = g_ptr_array_index (task->text_parts, i);

				/* Skip empty parts */
				if (IS_PART_EMPTY (part)) {
					lenvec[i] = 0;
					scvec[i] = (guchar *) "";
					continue;
				}

				/*
				 * Check raw flags: one non-utf part switches the whole
				 * vector to raw as all parts are scanned in one call
				 */
				if (!IS_PART_UTF (part)) {
					raw = TRUE;
				}

				/* Select data for regexp */
				if (re_class->type == RSPAMD_RE_RAWMIME) {
					in = part->orig->data;
					len = part->orig->len;
					raw = TRUE;
				}
				else {
					in = part->content->data;
					len = part->content->len;
				}

				scvec[i] = (guchar *) in;
				lenvec[i] = len;
			}

			ret = rspamd_re_cache_process_regexp_data (rt, re,
					task->task_pool, scvec, lenvec, task->text_parts->len, raw);
			debug_task ("checking mime regexp: %s -> %d",
					rspamd_regexp_get_pattern (re), ret);
			g_free (scvec);
			g_free (lenvec);
		}
		break;
	case RSPAMD_RE_URL:
		cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails);

		if (cnt > 0) {
			scvec = g_malloc (sizeof (*scvec) * cnt);
			lenvec = g_malloc (sizeof (*lenvec) * cnt);
			g_hash_table_iter_init (&it, task->urls);
			i = 0;

			while (g_hash_table_iter_next (&it, &k, &v)) {
				url = v;
				in = url->string;
				len = url->urllen;
				raw = FALSE;

				scvec[i] = (guchar *)in;
				lenvec[i++] = len;
			}

			g_hash_table_iter_init (&it, task->emails);

			while (g_hash_table_iter_next (&it, &k, &v)) {
				url = v;
				in = url->string;
				len = url->urllen;
				raw = FALSE;

				scvec[i] = (guchar *) in;
				lenvec[i++] = len;
			}

			/* Both tables must yield exactly the number of sized slots */
			g_assert (i == cnt);

			ret = rspamd_re_cache_process_regexp_data (rt, re,
					task->task_pool, scvec, lenvec, i, raw);
			debug_task ("checking url regexp: %s -> %d",
					rspamd_regexp_get_pattern (re), ret);
			g_free (scvec);
			g_free (lenvec);
		}
		break;
	case RSPAMD_RE_BODY:
		raw = TRUE;
		in = task->msg.begin;
		len = task->msg.len;

		ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool,
				(const guchar **)&in, &len, 1, raw);
		debug_task ("checking rawbody regexp: %s -> %d",
				rspamd_regexp_get_pattern (re), ret);
		break;
	case RSPAMD_RE_MAX:
		msg_err_task ("regexp of class invalid has been called: %s",
				rspamd_regexp_get_pattern (re));
		break;
	}

#if WITH_HYPERSCAN
	if (!rt->cache->disable_hyperscan) {
		rspamd_re_cache_finish_class (rt, re_class);
	}
#endif

	setbit (rt->checked, re_id);

	return rt->results[re_id];
}
/*
 * Slow path evaluation of a single regexp: every buffer selected by the class
 * type is passed to rspamd_re_cache_process_regexp_data () one by one, then
 * the regexp is marked as checked and its rt->results[] entry is returned.
 */
static guint
rspamd_re_cache_exec_re (struct rspamd_task *task,
		struct rspamd_re_runtime *rt,
		rspamd_regexp_t *re,
		struct rspamd_re_class *re_class,
		gboolean is_strong)
{
	struct rspamd_re_cache *cache = rt->cache;
	struct raw_header *hdr;
	struct mime_text_part *tpart;
	struct rspamd_url *u;
	GHashTableIter hiter;
	GHashTable *tbl;
	GList *node;
	gpointer key, val;
	const gchar *data;
	gsize dlen;
	gboolean raw = FALSE;
	guint ret = 0, idx, tnum, re_id;

	msg_debug_re_cache ("get to the slow path for re type: %s: %s",
			rspamd_re_cache_type_to_string (re_class->type),
			rspamd_regexp_get_pattern (re));
	re_id = rspamd_regexp_get_cache_id (re);

	switch (re_class->type) {
	case RSPAMD_RE_HEADER:
	case RSPAMD_RE_RAWHEADER:
		/* Walk over all headers with the requested name */
		for (node = rspamd_message_get_header (task, re_class->type_data,
				is_strong); node != NULL; node = g_list_next (node)) {
			hdr = node->data;

			if (re_class->type == RSPAMD_RE_RAWHEADER) {
				data = hdr->value;
				raw = TRUE;

				if (data == NULL) {
					/* Nothing to match in this header */
					continue;
				}
			}
			else {
				data = hdr->decoded;

				/* Decoded value is matched only if it is valid utf8 */
				if (data == NULL || !g_utf8_validate (data, -1, NULL)) {
					continue;
				}
			}

			ret = rspamd_re_cache_process_regexp_data (rt, re,
					task->task_pool, data, strlen (data), raw);
			debug_task ("checking header %s regexp: %s -> %d",
					re_class->type_data,
					rspamd_regexp_get_pattern (re), ret);
		}
		break;

	case RSPAMD_RE_ALLHEADER:
		raw = TRUE;
		data = task->raw_headers_content.begin;
		dlen = task->raw_headers_content.len;
		ret = rspamd_re_cache_process_regexp_data (rt, re,
				task->task_pool, data, dlen, raw);
		debug_task ("checking allheader regexp: %s -> %d",
				rspamd_regexp_get_pattern (re), ret);
		break;

	case RSPAMD_RE_MIME:
	case RSPAMD_RE_RAWMIME:
		/* Iterate through text parts */
		for (idx = 0; idx < task->text_parts->len; idx++) {
			tpart = g_ptr_array_index (task->text_parts, idx);

			if (IS_PART_EMPTY (tpart)) {
				continue;
			}

			/* The raw flag is sticky for all subsequent parts */
			if (!IS_PART_UTF (tpart)) {
				raw = TRUE;
			}

			if (re_class->type == RSPAMD_RE_RAWMIME) {
				data = tpart->orig->data;
				dlen = tpart->orig->len;
				raw = TRUE;
			}
			else {
				data = tpart->content->data;
				dlen = tpart->content->len;
			}

			if (dlen == 0) {
				continue;
			}

			ret = rspamd_re_cache_process_regexp_data (rt, re,
					task->task_pool, data, dlen, raw);
			debug_task ("checking mime regexp: %s -> %d",
					rspamd_regexp_get_pattern (re), ret);
		}
		break;

	case RSPAMD_RE_URL:
		/* Urls are scanned first, emails afterwards */
		for (tnum = 0; tnum < 2; tnum++) {
			tbl = (tnum == 0) ? task->urls : task->emails;
			g_hash_table_iter_init (&hiter, tbl);

			while (g_hash_table_iter_next (&hiter, &key, &val)) {
				u = val;
				data = u->string;
				dlen = u->urllen;
				raw = FALSE;
				ret = rspamd_re_cache_process_regexp_data (rt, re,
						task->task_pool, data, dlen, raw);
			}
		}

		debug_task ("checking url regexp: %s -> %d",
				rspamd_regexp_get_pattern (re), ret);
		break;

	case RSPAMD_RE_BODY:
		raw = TRUE;
		data = task->msg.begin;
		dlen = task->msg.len;
		ret = rspamd_re_cache_process_regexp_data (rt, re,
				task->task_pool, data, dlen, raw);
		debug_task ("checking rawbody regexp: %s -> %d",
				rspamd_regexp_get_pattern (re), ret);
		break;

	case RSPAMD_RE_MAX:
		msg_err_task ("regexp of class invalid has been called: %s",
				rspamd_regexp_get_pattern (re));
		break;
	}

#if WITH_HYPERSCAN
	if (!rt->cache->disable_hyperscan) {
		rspamd_re_cache_finish_class (rt, re_class);
	}
#endif

	setbit (rt->checked, re_id);

	return rt->results[re_id];
}