static void compareseq (int xoff, int xlim, int yoff, int ylim, int minimal) { int * const xv = xvec; /* Help the compiler. */ int * const yv = yvec; /* Slide down the bottom initial diagonal. */ while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) ++xoff, ++yoff; /* Slide up the top initial diagonal. */ while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) --xlim, --ylim; /* Handle simple cases. */ if (xoff == xlim) while (yoff < ylim) files[1].changed_flag[files[1].realindexes[yoff++]] = 1; else if (yoff == ylim) while (xoff < xlim) files[0].changed_flag[files[0].realindexes[xoff++]] = 1; else { int c; struct partition part; /* Find a point of correspondence in the middle of the files. */ c = diag (xoff, xlim, yoff, ylim, minimal, &part); if (c == 1) { /* This should be impossible, because it implies that one of the two subsequences is empty, and that case was handled above without calling `diag'. Let's verify that this is true. */ abort (); #if 0 /* The two subsequences differ by a single insert or delete; record it and we are done. */ if (part.xmid - part.ymid < xoff - yoff) files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1; else files[0].changed_flag[files[0].realindexes[part.xmid]] = 1; #endif } else { /* Use the partitions to split this problem into subproblems. */ compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); } } }
double fstrcmp (const char *string1, const char *string2) { int i; size_t fdiag_len; static int *fdiag_buf; static size_t fdiag_max; /* set the info for each string. */ string[0].data = string1; string[0].data_length = strlen (string1); string[1].data = string2; string[1].data_length = strlen (string2); /* short-circuit obvious comparisons */ if (string[0].data_length == 0 && string[1].data_length == 0) return 1.0; if (string[0].data_length == 0 || string[1].data_length == 0) return 0.0; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ too_expensive = 1; for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2) too_expensive <<= 1; if (too_expensive < 256) too_expensive = 256; /* Because fstrcmp is typically called multiple times, while scanning symbol tables, etc, attempt to minimize the number of memory allocations performed. Thus, we use a static buffer for the diagonal vectors, and never free them. */ fdiag_len = string[0].data_length + string[1].data_length + 3; if (fdiag_len > fdiag_max) { fdiag_max = fdiag_len; fdiag_buf = xrealloc (fdiag_buf, fdiag_max * (2 * sizeof (int))); } fdiag = fdiag_buf + string[1].data_length + 1; bdiag = fdiag + fdiag_len; /* Now do the main comparison algorithm */ string[0].edit_count = 0; string[1].edit_count = 0; compareseq (0, string[0].data_length, 0, string[1].data_length, 0); /* The result is ((number of chars in common) / (average length of the strings)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ return ((double) (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count) / (string[0].data_length + string[1].data_length)); }
/* Report the differences of two files.

   FILEVEC holds the two files to compare.  CALLBACK and USERDATA are
   not interpreted here; they are passed on to build_script.

   The function allocates the changed_flag vectors and the diagonal
   work vectors, runs the comparison, and releases that storage (plus
   filevec[0].undiscarded) before returning.  It has no way to report
   an error to its caller, so running out of memory is fatal: the
   function calls abort () instead of dereferencing a null pointer.  */
void
diff_2_files (struct file_data filevec[], add_change_callback callback,
              void* userData)
{
  int diags;
  int i;
  char *flag_space;   /* Base of the shared changed_flag allocation.  */
  int *diag_space;    /* Base of the shared fdiag/bdiag allocation.  */

  /* Allocate vectors for the results of comparison:
     a flag for each line of each file, saying whether that line
     is an insertion or deletion.
     Allocate an extra element, always zero, at each end of each vector.
     calloc hands back zero-filled storage, so no separate clearing
     step is needed.  */
  size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
  flag_space = (char *) calloc (s, 1);
  if (flag_space == NULL)
    abort ();
  filevec[0].changed_flag = flag_space + 1;
  filevec[1].changed_flag
    = filevec[0].changed_flag + filevec[0].buffered_lines + 2;

  /* Some lines are obviously insertions or deletions
     because they don't match anything.  Detect them now,
     and avoid even thinking about them in the main comparison
     algorithm.  */
  discard_confusing_lines (filevec);

  /* Now do the main comparison algorithm, considering just the
     undiscarded lines.  */
  xvec = filevec[0].undiscarded;
  yvec = filevec[1].undiscarded;
  diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
  diag_space = (int *) malloc (diags * (2 * sizeof (int)));
  if (diag_space == NULL)
    abort ();

  /* One allocation backs both vectors; each origin is shifted so that
     negative diagonal numbers are valid indices.  */
  fdiag = diag_space;
  bdiag = fdiag + diags;
  fdiag += filevec[1].nondiscarded_lines + 1;
  bdiag += filevec[1].nondiscarded_lines + 1;

  /* Set TOO_EXPENSIVE to be approximate square root of input size,
     bounded below by 256.  */
  too_expensive = 1;
  for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
       i != 0; i >>= 2)
    too_expensive <<= 1;
  too_expensive = max (256, too_expensive);

  files[0] = filevec[0];
  files[1] = filevec[1];

  compareseq (0, filevec[0].nondiscarded_lines,
              0, filevec[1].nondiscarded_lines, no_discards);

  /* Release the diagonal vectors through the saved base pointer rather
     than by undoing the offset applied to fdiag.  */
  free (diag_space);

  /* Modify the results slightly to make them prettier
     in cases where that can validly be done.  */
  shift_boundaries (filevec);

  /* Hand the comparison results to build_script, together with the
     caller's callback and user data.  */
  build_script (filevec, callback, userData);

  free (filevec[0].undiscarded);
  free (flag_space);
}
double fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) { struct context ctxt; int xvec_length = strlen (string1); int yvec_length = strlen (string2); int i; size_t fdiag_len; int *buffer; size_t bufmax; /* short-circuit obvious comparisons */ if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); if (lower_bound > 0) { /* Compute a quick upper bound. Each edit is an insertion or deletion of an element, hence modifies the length of the sequence by at most 1. Therefore, when starting from a sequence X and ending at a sequence Y, with N edits, | yvec_length - xvec_length | <= N. (Proof by induction over N.) So, at the end, we will have edit_count >= | xvec_length - yvec_length |. and hence result = (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) <= (xvec_length + yvec_length - | yvec_length - xvec_length |) / (xvec_length + yvec_length) = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). */ volatile double upper_bound = (double) (2 * MIN (xvec_length, yvec_length)) / (xvec_length + yvec_length); if (upper_bound < lower_bound) /* Prob: 74% */ /* Return an arbitrary value < LOWER_BOUND. */ return 0.0; #if CHAR_BIT <= 8 /* When X and Y are both small, avoid the overhead of setting up an array of size 256. */ if (xvec_length + yvec_length >= 20) /* Prob: 99% */ { /* Compute a less quick upper bound. Each edit is an insertion or deletion of a character, hence modifies the occurrence count of a character by 1 and leaves the other occurrence counts unchanged. Therefore, when starting from a sequence X and ending at a sequence Y, and denoting the occurrence count of C in X with OCC (X, C), with N edits, sum_C | OCC (X, C) - OCC (Y, C) | <= N. (Proof by induction over N.) 
So, at the end, we will have edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, and hence result = (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) / (xvec_length + yvec_length). */ int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ int sum; /* Determine the occurrence counts in X. */ memset (occ_diff, 0, sizeof (occ_diff)); for (i = xvec_length - 1; i >= 0; i--) occ_diff[(unsigned char) string1[i]]++; /* Subtract the occurrence counts in Y. */ for (i = yvec_length - 1; i >= 0; i--) occ_diff[(unsigned char) string2[i]]--; /* Sum up the absolute values. */ sum = 0; for (i = 0; i <= UCHAR_MAX; i++) { int d = occ_diff[i]; sum += (d >= 0 ? d : -d); } upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); if (upper_bound < lower_bound) /* Prob: 66% */ /* Return an arbitrary value < LOWER_BOUND. */ return 0.0; } #endif } /* set the info for each string. */ ctxt.xvec = string1; ctxt.yvec = string2; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ ctxt.too_expensive = 1; for (i = xvec_length + yvec_length; i != 0; i >>= 2) ctxt.too_expensive <<= 1; if (ctxt.too_expensive < 256) ctxt.too_expensive = 256; /* Allocate memory for fdiag and bdiag from a thread-local pool. */ fdiag_len = xvec_length + yvec_length + 3; gl_once (keys_init_once, keys_init); buffer = (int *) gl_tls_get (buffer_key); bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); if (fdiag_len > bufmax) { /* Need more memory. */ bufmax = 2 * bufmax; if (fdiag_len > bufmax) bufmax = fdiag_len; /* Calling xrealloc would be a waste: buffer's contents does not need to be preserved. 
*/ if (buffer != NULL) free (buffer); buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); gl_tls_set (buffer_key, buffer); gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); } ctxt.fdiag = buffer + yvec_length + 1; ctxt.bdiag = ctxt.fdiag + fdiag_len; /* The edit_count is only ever increased. The computation can be aborted when (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) < lower_bound, or equivalently edit_count > (xvec_length + yvec_length) * (1 - lower_bound) or equivalently edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). We need to add an epsilon inside the floor(...) argument, to neutralize rounding errors. */ ctxt.edit_count_limit = (lower_bound < 1.0 ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001)) : 0); /* Now do the main comparison algorithm */ ctxt.edit_count = - ctxt.edit_count_limit; if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ /* The edit_count passed the limit. Hence the result would be < lower_bound. We can return any value < lower_bound instead. */ return 0.0; ctxt.edit_count += ctxt.edit_count_limit; /* The result is ((number of chars in common) / (average length of the strings)). The numerator is = xvec_length - (number of calls to NOTE_DELETE) = yvec_length - (number of calls to NOTE_INSERT) = 1/2 * (xvec_length + yvec_length - (number of edits)). This is admittedly biased towards finding that the strings are similar, however it does produce meaningful results. */ return ((double) (xvec_length + yvec_length - ctxt.edit_count) / (xvec_length + yvec_length)); }