// Worked examples for has_common_substring(). Each string is passed together
// with its length. The expected results assume that FFUZZY_MIN_MATCH == 7.
static void examples(void)
{
	// Example 1 ("abcdefghijklmn" and "hijklmnopqrstu"):
	// expect true because they have common substring "hijklmn".
	assert(has_common_substring("abcdefghijklmn", 14, "hijklmnopqrstu", 14) == true);

	// Example 2 ("commonstring" and "differentstring"):
	// expect false because they don't have common substrings of length
	// FFUZZY_MIN_MATCH (the longest shared run, "string", is only 6 characters).
	assert(has_common_substring("commonstring", 12, "differentstring", 15) == false);

	// Example 3 ("abcdefg" and "abcdefg"):
	// expect true because they have common substring "abcdefg"
	// (which is the whole string).
	assert(has_common_substring("abcdefg", 7, "abcdefg", 7) == true);

	// Example 4 ("abc" and "abc"):
	// expect false because they don't have common substrings of length
	// FFUZZY_MIN_MATCH (even if they are identical).
	assert(has_common_substring("abc", 3, "abc", 3) == false);
}
/*
  this is the low level string scoring algorithm. It takes two strings
  and scores them on a scale of 0-100 where 0 is a terrible match and
  100 is a great match. The block_size is used to cope with very small
  messages.

  Returns 0 when either string is longer than SPAMSUM_LENGTH, when
  has_common_substring() reports no common substring, when both strings
  are empty, or when the rescaled edit distance reaches 100 or more.
*/
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
{
	u32 score;
	u32 len1, len2;
	unsigned long long cap;
	int edit_distn(const char *from, int from_len, const char *to, int to_len);

	len1 = strlen(s1);
	len2 = strlen(s2);

	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
		/* not a real spamsum signature? */
		return 0;
	}

	/* the two strings must have a common substring of length
	   ROLLING_WINDOW to be candidates */
	if (has_common_substring(s1, s2) == 0) {
		return 0;
	}

	/* defensive: the scaling step below divides by len1 + len2, so two
	   empty strings must never reach it */
	if (len1 + len2 == 0) {
		return 0;
	}

	/* compute the edit distance between the two strings. The edit
	   distance gives us a pretty good idea of how closely related
	   the two strings are */
	score = edit_distn(s1, len1, s2, len2);

	/* scale the edit distance by the lengths of the two strings. This
	   changes the score to be a measure of the proportion of the
	   message that has changed rather than an absolute quantity. It
	   also copes with the variability of the string lengths. */
	score = (score * SPAMSUM_LENGTH) / (len1 + len2);

	/* at this stage the score occurs roughly on a 0-64 scale,
	 * with 0 being a good match and 64 being a complete
	 * mismatch */

	/* rescale to a 0-100 scale (friendlier to humans) */
	score = (100 * score) / 64;

	/* it is possible to get a score above 100 here, but it is a
	   really terrible match */
	if (score >= 100) {
		return 0;
	}

	/* now re-scale on a 0-100 scale with 0 being a poor match and
	   100 being a excellent match. */
	score = 100 - score;

	/* when the blocksize is small we don't want to exaggerate the
	   match size. The cap is computed in a wider type: in 32-bit
	   arithmetic the product wraps around for large block sizes, which
	   would wrongly clamp a good match down to a tiny score. */
	cap = (unsigned long long)(block_size / MIN_BLOCKSIZE) * MIN(len1, len2);
	if (score > cap) {
		/* cap < score <= 100 here, so the narrowing is lossless */
		score = (u32)cap;
	}

	return score;
}