Exemple #1
0
// Convenience wrapper: edit distance of two NUL-terminated strings.
// Each length is measured with strlen(); a NULL argument is forwarded
// to edit_distn() as-is together with a length of 0.
int edit_dist(const char *a, const char *b) {
  unsigned int len_a = a ? (unsigned int)strlen(a) : 0;
  unsigned int len_b = b ? (unsigned int)strlen(b) : 0;

  return edit_distn(a, len_a, b, len_b);
}
Exemple #2
0
/*
  this is the low level string scoring algorithm. It takes two strings
  and scores them on a scale of 0-100 where 0 is a terrible match and
  100 is a great match. The block_size is used to cope with very small
  messages.

  s1, s2:     NUL-terminated strings to compare (passed to strlen, so
              they must not be NULL)
  block_size: limits the score when it is small relative to
              MIN_BLOCKSIZE (see the cap at the end of the function)

  Returns 0 when either string is longer than SPAMSUM_LENGTH, when both
  strings are empty, when has_common_substring() returns 0, or when the
  rescaled edit distance reaches 100. Otherwise returns 100 minus the
  rescaled edit distance, capped at
  block_size/MIN_BLOCKSIZE * MIN(len1, len2).
*/
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
{
	u32 score;
	u32 len1, len2;
	unsigned long long cap;
	int edit_distn(const char *from, int from_len, const char *to, int to_len);

	len1 = strlen(s1);
	len2 = strlen(s2);

	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
		/* not a real spamsum signature? */
		return 0;
	}

	/* two empty strings would make the scaling below divide by zero;
	   do not rely on has_common_substring() to filter them out */
	if (len1 + len2 == 0) {
		return 0;
	}

	/* the two strings must have a common substring of length
	   ROLLING_WINDOW to be candidates */
	if (has_common_substring(s1, s2) == 0) {
		return 0;
	}

	/* compute the edit distance between the two strings. The edit distance gives
	   us a pretty good idea of how closely related the two strings are */
	score = edit_distn(s1, len1, s2, len2);

	/* scale the edit distance by the lengths of the two
	   strings. This changes the score to be a measure of the
	   proportion of the message that has changed rather than an
	   absolute quantity. It also copes with the variability of
	   the string lengths. */
	score = (score * SPAMSUM_LENGTH) / (len1 + len2);

	/* at this stage the score occurs roughly on a 0-64 scale,
	 * with 0 being a good match and 64 being a complete
	 * mismatch */

	/* rescale to a 0-100 scale (friendlier to humans) */
	score = (100 * score) / 64;

	/* it is possible to get a score above 100 here, but it is a
	   really terrible match */
	if (score >= 100) return 0;

	/* now re-scale on a 0-100 scale with 0 being a poor match and
	   100 being an excellent match. */
	score = 100 - score;

	/* when the blocksize is small we don't want to exaggerate the match size.
	   The cap is computed in a wider type: in u32 the product wraps for
	   large block sizes and would wrongly clamp a good score */
	cap = (unsigned long long)(block_size / MIN_BLOCKSIZE) * MIN(len1, len2);
	if (score > cap) {
		/* cap < score <= 100 here, so it fits in u32 */
		score = (u32)cap;
	}

	return score;
}