/**
 * Build a shingle (RSPAMD_SHINGLE_SIZE filtered hashes) from a word array.
 *
 * @param input array of rspamd_fstring_t words
 * @param key 16 byte master key used to derive one SIP key per hash pipe
 * @param pool if not NULL the result is taken from this pool, otherwise
 * it is allocated with g_malloc
 * @param filter callback that reduces each pipe of hashes to a single value
 * @param filterd opaque pointer handed to `filter` unchanged
 * @return the filled shingle structure
 */
struct rspamd_shingle *
rspamd_shingles_generate (GArray *input,
		const guchar key[16],
		rspamd_mempool_t *pool,
		rspamd_shingles_filter filter,
		gpointer filterd)
{
	struct rspamd_shingle *shingle;
	GArray *pipes[RSPAMD_SHINGLE_SIZE];
	rspamd_sipkey_t sip_keys[RSPAMD_SHINGLE_SIZE];
	guchar digest[BLAKE2B_OUTBYTES];
	const guint8 digest_len = sizeof (digest);
	const guchar *src_key = key;
	guchar *dst_key = (guchar *)&sip_keys[0];
	GString *row;
	rspamd_fstring_t *word;
	blake2b_state bs;
	guint64 val;
	gint pos, n, start = 0;

	shingle = (pool != NULL) ?
			rspamd_mempool_alloc (pool, sizeof (*shingle)) :
			g_malloc (sizeof (*shingle));

	row = g_string_sized_new (256);

	/*
	 * Prepare one hash pipe and one key per shingle element. Keys form a
	 * chain: each one is the first 16 bytes of the blake2b digest of the
	 * previous 16 byte key, starting from the caller supplied `key`.
	 * NOTE(review): stepping by 16 bytes assumes rspamd_sipkey_t is exactly
	 * 16 bytes wide - confirm against its declaration.
	 */
	for (pos = 0; pos < RSPAMD_SHINGLE_SIZE; pos ++) {
		pipes[pos] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
				input->len + SHINGLES_WINDOW);

		blake2b_init (&bs, BLAKE2B_OUTBYTES);
		blake2b_update (&bs, src_key, 16);
		blake2b_final (&bs, digest, digest_len);

		for (n = 0; n < 16; n ++) {
			dst_key[n] = digest[n];
		}

		src_key = dst_key;
		dst_key += 16;
	}

	/*
	 * Slide over the words: a row is emitted once SHINGLES_WINDOW words
	 * are available past `start`, and once more unconditionally at the
	 * end of input (so short or empty inputs still yield one row).
	 */
	for (pos = 0; pos <= (gint)input->len; pos ++) {
		if (pos - start < SHINGLES_WINDOW && pos != (gint)input->len) {
			continue;
		}

		/* Words are concatenated without any separator */
		for (n = start; n < pos; n ++) {
			word = &g_array_index (input, rspamd_fstring_t, n);
			g_string_append_len (row, word->begin, word->len);
		}

		start ++;

		/* Hash the row with every key and push it to the matching pipe */
		for (n = 0; n < RSPAMD_SHINGLE_SIZE; n ++) {
			rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
					sip_keys[n]);
			g_array_append_val (pipes[n], val);
		}

		g_string_assign (row, "");
	}

	/* Reduce every pipe to a single value and release the pipe */
	for (pos = 0; pos < RSPAMD_SHINGLE_SIZE; pos ++) {
		shingle->hashes[pos] = filter ((guint64 *)pipes[pos]->data,
				pipes[pos]->len, pos, key, filterd);
		g_array_free (pipes[pos], TRUE);
	}

	g_string_free (row, TRUE);

	return shingle;
}
rspamd_shingles_from_text (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, gpointer filterd, enum rspamd_shingle_alg alg) { struct rspamd_shingle *res; guint64 **hashes; guchar **keys; rspamd_fstring_t *row; rspamd_stat_token_t *word; guint64 val; gint i, j, k; gsize hlen, beg = 0; enum rspamd_cryptobox_fast_hash_type ht; if (pool != NULL) { res = rspamd_mempool_alloc (pool, sizeof (*res)); } else { res = g_malloc (sizeof (*res)); } row = rspamd_fstring_sized_new (256); /* Init hashes pipes and keys */ hashes = g_malloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE); hlen = input->len > SHINGLES_WINDOW ? (input->len - SHINGLES_WINDOW + 1) : 1; keys = rspamd_shingles_get_keys_cached (key); for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { hashes[i] = g_malloc (hlen * sizeof (guint64)); } /* Now parse input words into a vector of hashes using rolling window */ if (alg == RSPAMD_SHINGLES_OLD) { for (i = 0; i <= (gint)input->len; i ++) { if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { for (j = beg; j < i; j ++) { word = &g_array_index (input, rspamd_stat_token_t, j); row = rspamd_fstring_append (row, word->begin, word->len); } /* Now we need to create a new row here */ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, keys[j]); g_assert (hlen > beg); hashes[j][beg] = val; } beg++; row = rspamd_fstring_assign (row, "", 0); } } } else { guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed; switch (alg) { case RSPAMD_SHINGLES_XXHASH: ht = RSPAMD_CRYPTOBOX_XXHASH64; break; case RSPAMD_SHINGLES_MUMHASH: ht = RSPAMD_CRYPTOBOX_MUMHASH; break; default: ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT; break; } memset (res, 0, sizeof (res)); for (i = 0; i <= (gint)input->len; i ++) { if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { /* Shift hashes window to right */ for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { res[j * 
SHINGLES_WINDOW + k] = res[j * SHINGLES_WINDOW + k + 1]; } word = &g_array_index (input, rspamd_stat_token_t, beg); /* Insert the last element to the pipe */ memcpy (&seed, keys[j], sizeof (seed)); res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = rspamd_cryptobox_fast_hash_specific (ht, word->begin, word->len, seed); val = 0; for (k = 0; k < SHINGLES_WINDOW; k ++) { val ^= res[j * SHINGLES_WINDOW + k] >> (8 * (SHINGLES_WINDOW - k - 1)); } g_assert (hlen > beg); hashes[j][beg] = val; } beg++; } } } /* Now we need to filter all hashes and make a shingles result */ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { res->hashes[i] = filter (hashes[i], hlen, i, key, filterd); g_free (hashes[i]); } g_free (hashes); rspamd_fstring_free (row); return res; }
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, rspamd_mempool_t *pool, GArray *words, gboolean is_utf, const gchar *prefix, GPtrArray *result) { rspamd_token_t *new_tok = NULL; rspamd_stat_token_t *token; struct rspamd_osb_tokenizer_config *osb_cf; guint64 cur, seed; struct token_pipe_entry *hashpipe; guint32 h1, h2; gsize token_size; guint processed = 0, i, w, window_size, token_flags = 0; if (words == NULL) { return FALSE; } osb_cf = ctx->tkcf; window_size = osb_cf->window_size; if (prefix) { seed = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, prefix, strlen (prefix), osb_cf->seed); } else { seed = osb_cf->seed; } hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); for (i = 0; i < window_size; i++) { hashpipe[i].h = 0xfe; hashpipe[i].t = NULL; } token_size = sizeof (rspamd_token_t) + sizeof (gdouble) * ctx->statfiles->len; g_assert (token_size > 0); for (w = 0; w < words->len; w ++) { token = &g_array_index (words, rspamd_stat_token_t, w); token_flags = token->flags; if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { rspamd_ftok_t ftok; ftok.begin = token->begin; ftok.len = token->len; cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) { cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, token->begin, token->len, osb_cf->seed); } else { rspamd_cryptobox_siphash ((guchar *)&cur, token->begin, token->len, osb_cf->sk); if (prefix) { cur ^= seed; } } } if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { new_tok = rspamd_mempool_alloc0 (pool, token_size); new_tok->flags = token_flags; new_tok->t1 = token; new_tok->t2 = token; new_tok->data = cur; new_tok->window_idx = 0; g_ptr_array_add (result, new_tok); continue; } #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ new_tok->flags = token_flags; \ new_tok->t1 = hashpipe[0].t; \ new_tok->t2 = hashpipe[i].t; \ if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ h1 = 
((guint32)hashpipe[0].h) * primes[0] + \ ((guint32)hashpipe[i].h) * primes[i << 1]; \ h2 = ((guint32)hashpipe[0].h) * primes[1] + \ ((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \ memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \ memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \ } \ else { \ new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ } \ new_tok->window_idx = i + 1; \ g_ptr_array_add (result, new_tok); \ } while(0) if (processed < window_size) { /* Just fill a hashpipe */ ++processed; hashpipe[window_size - processed].h = cur; hashpipe[window_size - processed].t = token; } else { /* Shift hashpipe */ for (i = window_size - 1; i > 0; i--) { hashpipe[i] = hashpipe[i - 1]; } hashpipe[0].h = cur; hashpipe[0].t = token; processed++; for (i = 1; i < window_size; i++) { ADD_TOKEN; } } } if (processed > 1 && processed <= window_size) { processed --; memmove (hashpipe, &hashpipe[window_size - processed], processed * sizeof (hashpipe[0])); for (i = 1; i < processed; i++) { ADD_TOKEN; } } #undef ADD_TOKEN return TRUE; }