gboolean rspamd_re_cache_load_hyperscan (struct rspamd_re_cache *cache, const char *cache_dir) { g_assert (cache != NULL); g_assert (cache_dir != NULL); #ifndef WITH_HYPERSCAN return FALSE; #else gchar path[PATH_MAX]; gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret; GHashTableIter it; gpointer k, v; guint8 *map, *p, *end; struct rspamd_re_class *re_class; struct rspamd_re_cache_elt *elt; struct stat st; g_hash_table_iter_init (&it, cache->re_classes); while (g_hash_table_iter_next (&it, &k, &v)) { re_class = v; rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir, G_DIR_SEPARATOR, re_class->hash); if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, FALSE, FALSE)) { msg_debug_re_cache ("load hyperscan database from '%s'", re_class->hash); fd = open (path, O_RDONLY); /* Read number of regexps */ g_assert (fd != -1); fstat (fd, &st); map = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { msg_err_re_cache ("cannot mmap %s: %s", path, strerror (errno)); close (fd); return FALSE; } close (fd); end = map + st.st_size; p = map + RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt); n = *(gint *)p; if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */ sizeof (guint64) + /* crc */ RSPAMD_HS_MAGIC_LEN + /* header */ sizeof (cache->plt) > (gsize)st.st_size) { /* Some wrong amount of regexps */ msg_err_re_cache ("bad number of expressions in %s: %d", path, n); munmap (map, st.st_size); return FALSE; } total += n; p += sizeof (n); hs_ids = g_malloc (n * sizeof (*hs_ids)); memcpy (hs_ids, p, n * sizeof (*hs_ids)); p += n * sizeof (*hs_ids); hs_flags = g_malloc (n * sizeof (*hs_flags)); memcpy (hs_flags, p, n * sizeof (*hs_flags)); /* Skip crc */ p += n * sizeof (*hs_ids) + sizeof (guint64); /* Cleanup */ if (re_class->hs_scratch != NULL) { hs_free_scratch (re_class->hs_scratch); } if (re_class->hs_db != NULL) { hs_free_database (re_class->hs_db); } if (re_class->hs_ids) { g_free (re_class->hs_ids); } re_class->hs_ids = NULL; re_class->hs_scratch = NULL; re_class->hs_db = NULL; if ((ret = hs_deserialize_database (p, end - p, &re_class->hs_db)) != HS_SUCCESS) { msg_err_re_cache ("bad hs database in %s: %d", path, ret); munmap (map, st.st_size); g_free (hs_ids); g_free (hs_flags); return FALSE; } munmap (map, st.st_size); g_assert (hs_alloc_scratch (re_class->hs_db, &re_class->hs_scratch) == HS_SUCCESS); /* * Now find hyperscan elts that are successfully compiled and * specify that they should be matched using hyperscan */ for (i = 0; i < n; i ++) { g_assert ((gint)cache->re->len > hs_ids[i] && hs_ids[i] >= 0); elt = g_ptr_array_index (cache->re, hs_ids[i]); if (hs_flags[i] & HS_FLAG_PREFILTER) { elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE; } else { elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN; } } re_class->hs_ids = hs_ids; g_free (hs_flags); re_class->nhs = n; } else { msg_err_re_cache ("invalid hyperscan hash file '%s'", path); return FALSE; } } msg_info_re_cache ("hyperscan database of %d regexps has been loaded", total); cache->hyperscan_loaded = TRUE; return TRUE; #endif }
/* * Calculates the specified regexp for the specified class if it's not calculated */ static guint rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, gboolean is_strong) { guint ret = 0, i, re_id; GPtrArray *headerlist; GHashTableIter it; struct raw_header *rh; const gchar *in, *end; const guchar **scvec; guint *lenvec; gboolean raw = FALSE; struct mime_text_part *part; struct rspamd_url *url; struct rspamd_re_cache *cache = rt->cache; gpointer k, v; guint len, cnt; msg_debug_re_cache ("get to the slow path for re type: %s: %s", rspamd_re_cache_type_to_string (re_class->type), rspamd_regexp_get_pattern (re)); re_id = rspamd_regexp_get_cache_id (re); switch (re_class->type) { case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ headerlist = rspamd_message_get_header_array (task, re_class->type_data, is_strong); if (headerlist) { scvec = g_malloc (sizeof (*scvec) * headerlist->len); lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); for (i = 0; i < headerlist->len; i ++) { rh = g_ptr_array_index (headerlist, i); if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; lenvec[i] = strlen (rh->value); } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, &end)) { lenvec[i] = 0; scvec[i] = (guchar *)""; continue; } lenvec[i] = end - in; } scvec[i] = (guchar *)in; } ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, headerlist->len, raw); debug_task ("checking header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, (const guchar **)&in, &len, 1, raw); debug_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate through text parts */ if (task->text_parts->len > 0) { scvec = g_malloc (sizeof (*scvec) * task->text_parts->len); lenvec = g_malloc (sizeof (*lenvec) * task->text_parts->len); for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); /* Skip empty parts */ if (IS_PART_EMPTY (part)) { lenvec[i] = 0; scvec[i] = (guchar *) ""; continue; } /* Check raw flags */ if (!IS_PART_UTF (part)) { raw = TRUE; } /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { in = part->orig->data; len = part->orig->len; raw = TRUE; } else { in = part->content->data; len = part->content->len; } scvec[i] = (guchar *) in; lenvec[i] = len; } ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, task->text_parts->len, raw); debug_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_URL: cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); g_hash_table_iter_init (&it, task->urls); i = 0; while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *)in; lenvec[i++] = len; } g_hash_table_iter_init (&it, task->emails); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; scvec[i] = (guchar *) in; lenvec[i++] = len; } g_assert (i == cnt); ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, scvec, lenvec, i, raw); debug_task ("checking url regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); g_free (scvec); g_free (lenvec); } break; case RSPAMD_RE_BODY: raw = TRUE; in = task->msg.begin; len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, (const guchar **)&in, &len, 1, raw); debug_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MAX: msg_err_task ("regexp of class invalid has been called: %s", rspamd_regexp_get_pattern (re)); break; } #if WITH_HYPERSCAN if (!rt->cache->disable_hyperscan) { rspamd_re_cache_finish_class (rt, re_class); } #endif setbit (rt->checked, re_id); return rt->results[re_id]; }
/* * Calculates the specified regexp for the specified class if it's not calculated */ static guint rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, gboolean is_strong) { guint ret = 0, i, re_id; GList *cur, *headerlist; GHashTableIter it; struct raw_header *rh; const gchar *in; gboolean raw = FALSE; struct mime_text_part *part; struct rspamd_url *url; struct rspamd_re_cache *cache = rt->cache; gpointer k, v; gsize len; msg_debug_re_cache ("get to the slow path for re type: %s: %s", rspamd_re_cache_type_to_string (re_class->type), rspamd_regexp_get_pattern (re)); re_id = rspamd_regexp_get_cache_id (re); switch (re_class->type) { case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ headerlist = rspamd_message_get_header (task, re_class->type_data, is_strong); if (headerlist) { cur = headerlist; while (cur) { rh = cur->data; if (re_class->type == RSPAMD_RE_RAWHEADER) { in = rh->value; raw = TRUE; } else { in = rh->decoded; /* Validate input */ if (!in || !g_utf8_validate (in, -1, NULL)) { cur = g_list_next (cur); continue; } } /* Match re */ if (in) { ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, strlen (in), raw); debug_task ("checking header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); } cur = g_list_next (cur); } } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate throught text parts */ for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); /* Skip empty parts */ if (IS_PART_EMPTY (part)) { continue; } /* Check raw flags */ if (!IS_PART_UTF (part)) { raw = TRUE; } /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { in = part->orig->data; len = part->orig->len; raw = TRUE; } else { in = part->content->data; len = part->content->len; } if (len > 0) { ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); } } break; case RSPAMD_RE_URL: g_hash_table_iter_init (&it, task->urls); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); } g_hash_table_iter_init (&it, task->emails); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; in = url->string; len = url->urllen; raw = FALSE; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); } debug_task ("checking url regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_BODY: raw = TRUE; in = task->msg.begin; len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, task->task_pool, in, len, raw); debug_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MAX: msg_err_task ("regexp of class invalid has been called: %s", rspamd_regexp_get_pattern (re)); break; } #if WITH_HYPERSCAN if (!rt->cache->disable_hyperscan) { rspamd_re_cache_finish_class (rt, re_class); } #endif setbit (rt->checked, re_id); return rt->results[re_id]; }