// Convenience wrapper: edit distance between two NUL-terminated strings.
// A NULL argument is forwarded to edit_distn() unchanged, with a length of 0.
// Returns whatever edit_distn() returns for the two (pointer, length) pairs.
int edit_dist(const char *a, const char *b)
{
  // Block-scope prototype so the wrapper does not rely on declaration order.
  int edit_distn(const char *from, int from_len, const char *to, int to_len);

  // strlen() must never see NULL, so a missing string counts as length 0.
  unsigned int a_len = a ? strlen(a) : 0;
  unsigned int b_len = b ? strlen(b) : 0;

  return edit_distn(a, a_len, b, b_len);
}
/*
  This is the low level string scoring algorithm. It takes two strings
  and scores them on a scale of 0-100, where 0 is a terrible match and
  100 is a great match. The block_size is used to cope with very small
  messages.

  s1, s2      NUL-terminated signature strings; must not be NULL
  block_size  block size the two signatures were generated with

  Returns 0 when either string is longer than SPAMSUM_LENGTH, when
  has_common_substring() reports no shared substring, or when the scaled
  edit distance reaches 100 or more; otherwise a score in 1..100, capped
  for small block sizes.
*/
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
{
  int edit_distn(const char *from, int from_len, const char *to, int to_len);
  u32 score;
  u32 len1, len2;
  u32 min_len;
  unsigned long long cap;

  len1 = strlen(s1);
  len2 = strlen(s2);

  if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
    /* not a real spamsum signature? */
    return 0;
  }

  /* the two strings must have a common substring of length
     ROLLING_WINDOW to be candidates */
  if (has_common_substring(s1, s2) == 0) {
    return 0;
  }

  /* defensive: two empty strings would make the divisor below zero */
  if (len1 + len2 == 0) {
    return 0;
  }

  /* compute the edit distance between the two strings. The edit
     distance gives us a pretty good idea of how closely related the
     two strings are */
  score = edit_distn(s1, len1, s2, len2);

  /* scale the edit distance by the lengths of the two strings. This
     changes the score to be a measure of the proportion of the message
     that has changed rather than an absolute quantity. It also copes
     with the variability of the string lengths. */
  score = (score * SPAMSUM_LENGTH) / (len1 + len2);

  /* at this stage the score occurs roughly on a 0-64 scale,
   * with 0 being a good match and 64 being a complete
   * mismatch */

  /* rescale to a 0-100 scale (friendlier to humans) */
  score = (100 * score) / 64;

  /* it is possible to get a score above 100 here, but it is a really
     terrible match */
  if (score >= 100) {
    return 0;
  }

  /* now re-scale on a 0-100 scale with 0 being a poor match and 100
     being an excellent match. */
  score = 100 - score;

  /* when the blocksize is small we don't want to exaggerate the match
     size. The cap is computed in a wider type: for large block sizes
     the product would wrap around in u32 (presumably 32 bits wide) and
     wrongly clamp a good score down to a tiny value or zero. */
  min_len = MIN(len1, len2);
  cap = (unsigned long long)(block_size / MIN_BLOCKSIZE) * min_len;
  if (score > cap) {
    /* cap < score <= 100 here, so the narrowing conversion is lossless */
    score = (u32)cap;
  }

  return score;
}