Example #1
0
struct rspamd_shingle*
rspamd_shingles_generate (GArray *input,
		const guchar key[16],
		rspamd_mempool_t *pool,
		rspamd_shingles_filter filter,
		gpointer filterd)
{
	struct rspamd_shingle *res;
	GArray *hashes[RSPAMD_SHINGLE_SIZE];
	rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
	guchar shabuf[BLAKE2B_OUTBYTES], *out_key;
	const guchar *cur_key;
	GString *row;
	rspamd_fstring_t *word;
	blake2b_state bs;
	guint64 val;
	gint i, j, beg = 0;
	guint8 shalen;

	if (pool != NULL) {
		res = rspamd_mempool_alloc (pool, sizeof (*res));
	}
	else {
		res = g_malloc (sizeof (*res));
	}

	blake2b_init (&bs, BLAKE2B_OUTBYTES);
	row = g_string_sized_new (256);
	cur_key = key;
	out_key = (guchar *)&keys[0];

	/* Init hashes pipes and keys */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
				input->len + SHINGLES_WINDOW);
		/*
		 * To generate a set of hashes we just apply sha256 to the
		 * initial key as many times as many hashes are required and
		 * xor left and right parts of sha256 to get a single 16 bytes SIP key.
		 */
		shalen = sizeof (shabuf);
		blake2b_update (&bs, cur_key, 16);
		blake2b_final (&bs, shabuf, shalen);

		for (j = 0; j < 16; j ++) {
			out_key[j] = shabuf[j];
		}
		blake2b_init (&bs, BLAKE2B_OUTBYTES);
		cur_key = out_key;
		out_key += 16;
	}

	/* Now parse input words into a vector of hashes using rolling window */
	for (i = 0; i <= (gint)input->len; i ++) {
		if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
			for (j = beg; j < i; j ++) {
				word = &g_array_index (input, rspamd_fstring_t, j);
				g_string_append_len (row, word->begin, word->len);
			}
			beg++;

			/* Now we need to create a new row here */
			for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
				rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
						keys[j]);
				g_array_append_val (hashes[j], val);
			}
			g_string_assign (row, "");
		}
	}

	/* Now we need to filter all hashes and make a shingles result */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
				i, key, filterd);
		g_array_free (hashes[i], TRUE);
	}

	g_string_free (row, TRUE);

	return res;
}
Example #2
0
rspamd_shingles_from_text (GArray *input,
		const guchar key[16],
		rspamd_mempool_t *pool,
		rspamd_shingles_filter filter,
		gpointer filterd,
		enum rspamd_shingle_alg alg)
{
	struct rspamd_shingle *res;
	guint64 **hashes;
	guchar **keys;
	rspamd_fstring_t *row;
	rspamd_stat_token_t *word;
	guint64 val;
	gint i, j, k;
	gsize hlen, beg = 0;
	enum rspamd_cryptobox_fast_hash_type ht;

	if (pool != NULL) {
		res = rspamd_mempool_alloc (pool, sizeof (*res));
	}
	else {
		res = g_malloc (sizeof (*res));
	}

	row = rspamd_fstring_sized_new (256);

	/* Init hashes pipes and keys */
	hashes = g_malloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
	hlen = input->len > SHINGLES_WINDOW ?
			(input->len - SHINGLES_WINDOW + 1) : 1;
	keys = rspamd_shingles_get_keys_cached (key);

	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		hashes[i] = g_malloc (hlen * sizeof (guint64));
	}

	/* Now parse input words into a vector of hashes using rolling window */
	if (alg == RSPAMD_SHINGLES_OLD) {
		for (i = 0; i <= (gint)input->len; i ++) {
			if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
				for (j = beg; j < i; j ++) {
					word = &g_array_index (input, rspamd_stat_token_t, j);
					row = rspamd_fstring_append (row, word->begin, word->len);
				}

				/* Now we need to create a new row here */
				for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
					rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
							keys[j]);
					g_assert (hlen > beg);
					hashes[j][beg] = val;
				}

				beg++;

				row = rspamd_fstring_assign (row, "", 0);
			}
		}
	}
	else {
		guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;

		switch (alg) {
		case RSPAMD_SHINGLES_XXHASH:
			ht = RSPAMD_CRYPTOBOX_XXHASH64;
			break;
		case RSPAMD_SHINGLES_MUMHASH:
			ht = RSPAMD_CRYPTOBOX_MUMHASH;
			break;
		default:
			ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
			break;
		}

		memset (res, 0, sizeof (res));
		for (i = 0; i <= (gint)input->len; i ++) {
			if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {

				for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
					/* Shift hashes window to right */
					for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
						res[j * SHINGLES_WINDOW + k] =
								res[j * SHINGLES_WINDOW + k + 1];
					}

					word = &g_array_index (input, rspamd_stat_token_t, beg);
					/* Insert the last element to the pipe */
					memcpy (&seed, keys[j], sizeof (seed));
					res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
							rspamd_cryptobox_fast_hash_specific (ht,
									word->begin, word->len,
									seed);
					val = 0;
					for (k = 0; k < SHINGLES_WINDOW; k ++) {
						val ^= res[j * SHINGLES_WINDOW + k] >>
								(8 * (SHINGLES_WINDOW - k - 1));
					}

					g_assert (hlen > beg);
					hashes[j][beg] = val;
				}
				beg++;
			}
		}
	}

	/* Now we need to filter all hashes and make a shingles result */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		res->hashes[i] = filter (hashes[i], hlen,
				i, key, filterd);
		g_free (hashes[i]);
	}

	g_free (hashes);

	rspamd_fstring_free (row);

	return res;
}
Example #3
0
gint
rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
		rspamd_mempool_t *pool,
		GArray *words,
		gboolean is_utf,
		const gchar *prefix,
		GPtrArray *result)
{
	rspamd_token_t *new_tok = NULL;
	rspamd_stat_token_t *token;
	struct rspamd_osb_tokenizer_config *osb_cf;
	guint64 cur, seed;
	struct token_pipe_entry *hashpipe;
	guint32 h1, h2;
	gsize token_size;
	guint processed = 0, i, w, window_size, token_flags = 0;

	if (words == NULL) {
		return FALSE;
	}

	osb_cf = ctx->tkcf;
	window_size = osb_cf->window_size;

	if (prefix) {
		seed = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
				prefix, strlen (prefix), osb_cf->seed);
	}
	else {
		seed = osb_cf->seed;
	}

	hashpipe = g_alloca (window_size * sizeof (hashpipe[0]));
	for (i = 0; i < window_size; i++) {
		hashpipe[i].h = 0xfe;
		hashpipe[i].t = NULL;
	}

	token_size = sizeof (rspamd_token_t) +
			sizeof (gdouble) * ctx->statfiles->len;
	g_assert (token_size > 0);

	for (w = 0; w < words->len; w ++) {
		token = &g_array_index (words, rspamd_stat_token_t, w);
		token_flags = token->flags;

		if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
			rspamd_ftok_t ftok;

			ftok.begin = token->begin;
			ftok.len = token->len;
			cur = rspamd_fstrhash_lc (&ftok, is_utf);
		}
		else {
			/* We know that the words are normalized */
			if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
				cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
						token->begin, token->len, osb_cf->seed);
			}
			else {
				rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
						token->len, osb_cf->sk);

				if (prefix) {
					cur ^= seed;
				}
			}
		}

		if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
			new_tok = rspamd_mempool_alloc0 (pool, token_size);
			new_tok->flags = token_flags;
			new_tok->t1 = token;
			new_tok->t2 = token;
			new_tok->data = cur;
			new_tok->window_idx = 0;
			g_ptr_array_add (result, new_tok);

			continue;
		}

#define ADD_TOKEN do {\
    new_tok = rspamd_mempool_alloc0 (pool, token_size); \
    new_tok->flags = token_flags; \
    new_tok->t1 = hashpipe[0].t; \
    new_tok->t2 = hashpipe[i].t; \
    if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \
        h1 = ((guint32)hashpipe[0].h) * primes[0] + \
            ((guint32)hashpipe[i].h) * primes[i << 1]; \
        h2 = ((guint32)hashpipe[0].h) * primes[1] + \
            ((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \
        memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \
        memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \
    } \
    else { \
        new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
    } \
    new_tok->window_idx = i + 1; \
    g_ptr_array_add (result, new_tok); \
  } while(0)

		if (processed < window_size) {
			/* Just fill a hashpipe */
			++processed;
			hashpipe[window_size - processed].h = cur;
			hashpipe[window_size - processed].t = token;
		}
		else {
			/* Shift hashpipe */
			for (i = window_size - 1; i > 0; i--) {
				hashpipe[i] = hashpipe[i - 1];
			}
			hashpipe[0].h = cur;
			hashpipe[0].t = token;

			processed++;

			for (i = 1; i < window_size; i++) {
				ADD_TOKEN;
			}
		}
	}

	if (processed > 1 && processed <= window_size) {
		processed --;
		memmove (hashpipe, &hashpipe[window_size - processed],
				processed * sizeof (hashpipe[0]));

		for (i = 1; i < processed; i++) {
			ADD_TOKEN;
		}
	}

#undef ADD_TOKEN

	return TRUE;
}