static void
compareseq (int xoff, int xlim, int yoff, int ylim, int minimal)
{
  int * const xv = xvec; /* Help the compiler.  */
  int * const yv = yvec;

  /* Slide down the bottom initial diagonal. */
  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
    ++xoff, ++yoff;
  /* Slide up the top initial diagonal. */
  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
    --xlim, --ylim;

  /* Handle simple cases. */
  if (xoff == xlim)
    while (yoff < ylim)
      files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
  else if (yoff == ylim)
    while (xoff < xlim)
      files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
  else
    {
      int c;
      struct partition part;

      /* Find a point of correspondence in the middle of the files.  */

      c = diag (xoff, xlim, yoff, ylim, minimal, &part);

      if (c == 1)
	{
	  /* This should be impossible, because it implies that
	     one of the two subsequences is empty,
	     and that case was handled above without calling `diag'.
	     Let's verify that this is true.  */
	  abort ();
#if 0
	  /* The two subsequences differ by a single insert or delete;
	     record it and we are done.  */
	  if (part.xmid - part.ymid < xoff - yoff)
	    files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
	  else
	    files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
#endif
	}
      else
	{
	  /* Use the partitions to split this problem into subproblems.  */
	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
	}
    }
}
/* Return a similarity score for STRING1 and STRING2.

   The score is 1.0 when both strings are empty and 0.0 when exactly one
   of them is empty.  Otherwise it is

       (len1 + len2 - edit_count1 - edit_count2) / (len1 + len2)

   where the edit counts are read from string[0] and string[1] after
   `compareseq' has run.

   The function works through file-scope variables (string, too_expensive,
   fdiag, bdiag) and keeps its scratch buffer in function-local statics.  */
double
fstrcmp (const char *string1, const char *string2)
{
  /* Backing store for both diagonal vectors.  It only ever grows and is
     intentionally never released: this function tends to be called many
     times in a row (scanning symbol tables and the like), so reusing the
     buffer keeps the number of allocations down.  */
  static int *diag_store;
  static size_t diag_store_len;

  size_t diag_len;
  int span;

  /* Publish both inputs to the comparison machinery.  */
  string[0].data = string1;
  string[0].data_length = strlen (string1);
  string[1].data = string2;
  string[1].data_length = strlen (string2);

  /* Trivial answers: two empty strings match perfectly, an empty string
     and a non-empty one have nothing in common.  */
  if (string[0].data_length == 0 || string[1].data_length == 0)
    return (string[0].data_length == 0 && string[1].data_length == 0
	    ? 1.0 : 0.0);

  /* TOO_EXPENSIVE is roughly the square root of the combined input size
     (one doubling for every two bits of the length), but never less
     than 256.  */
  too_expensive = 1;
  span = string[0].data_length + string[1].data_length;
  while (span != 0)
    {
      too_expensive <<= 1;
      span >>= 2;
    }
  if (too_expensive < 256)
    too_expensive = 256;

  /* Make sure the backing store holds two vectors of DIAG_LEN ints, then
     point fdiag and bdiag into it.  */
  diag_len = string[0].data_length + string[1].data_length + 3;
  if (diag_store_len < diag_len)
    {
      diag_store_len = diag_len;
      diag_store = xrealloc (diag_store, diag_store_len * (2 * sizeof (int)));
    }
  fdiag = diag_store + string[1].data_length + 1;
  bdiag = fdiag + diag_len;

  /* Run the comparison with both edit counters reset.  */
  string[0].edit_count = 0;
  string[1].edit_count = 0;
  compareseq (0, string[0].data_length, 0, string[1].data_length, 0);

  /* (characters in common) / (average length of the strings).  This leans
     towards calling strings similar, but the values are still useful.  */
  return ((double) (string[0].data_length + string[1].data_length
		    - string[1].edit_count - string[0].edit_count)
	  / (string[0].data_length + string[1].data_length));
}
/* Report the differences of two files.

   FILEVEC holds the two files to compare.  CALLBACK and USERDATA are
   forwarded unchanged to `build_script', which produces the edit script.

   While running, the function installs per-line change flags in
   filevec[0].changed_flag and filevec[1].changed_flag and sets the
   file-scope variables xvec, yvec, fdiag, bdiag, too_expensive and files.
   The change flags and filevec[0].undiscarded are released before
   returning, and the corresponding FILEVEC members are reset to NULL so
   the caller is not left holding pointers to freed memory.

   If working memory cannot be allocated the function calls abort (): it
   returns void and therefore has no way to report the failure.  */
void
diff_2_files (struct file_data filevec[], add_change_callback callback, void* userData)
{
  int diags;
  int i;
  size_t s;
  char *flag_buf;	/* Base of the shared change-flag allocation.  */
  int *diag_buf;	/* Base of the shared fdiag/bdiag allocation.  */

  /* Allocate vectors for the results of comparison:
     a flag for each line of each file, saying whether that line
     is an insertion or deletion.
     Allocate an extra element, always zero, at each end of each vector.
     Both vectors live in one zero-filled allocation.  */

  s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
  flag_buf = (char *) calloc (s, 1);
  if (flag_buf == NULL)
    abort ();
  filevec[0].changed_flag = flag_buf + 1;
  filevec[1].changed_flag = filevec[0].changed_flag
			    + filevec[0].buffered_lines + 2;

  /* Some lines are obviously insertions or deletions
     because they don't match anything.  Detect them now, and
     avoid even thinking about them in the main comparison algorithm.  */

  discard_confusing_lines (filevec);

  /* Now do the main comparison algorithm, considering just the
     undiscarded lines.  */

  xvec = filevec[0].undiscarded;
  yvec = filevec[1].undiscarded;
  diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
  diag_buf = (int *) malloc (diags * (2 * sizeof (int)));
  if (diag_buf == NULL)
    abort ();

  /* fdiag and bdiag each own DIAGS ints of the buffer.  Both are biased by
     the number of undiscarded lines in file 1, plus one.  */
  fdiag = diag_buf + filevec[1].nondiscarded_lines + 1;
  bdiag = fdiag + diags;

  /* Set TOO_EXPENSIVE to be approximate square root of input size,
     bounded below by 256.  */
  too_expensive = 1;
  for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
       i != 0; i >>= 2)
    too_expensive <<= 1;
  too_expensive = max (256, too_expensive);

  files[0] = filevec[0];
  files[1] = filevec[1];

  compareseq (0, filevec[0].nondiscarded_lines,
	      0, filevec[1].nondiscarded_lines, no_discards);

  /* Free through the saved base pointer rather than un-biasing fdiag.  */
  free (diag_buf);

  /* Modify the results slightly to make them prettier
     in cases where that can validly be done.  */

  shift_boundaries (filevec);

  /* Get the results of comparison in the form of a chain
     of `struct change's -- an edit script.  */

  build_script (filevec, callback, userData);

  /* NOTE(review): only filevec[0].undiscarded is freed here, exactly as
     before; presumably filevec[1].undiscarded shares that allocation --
     confirm against discard_confusing_lines.  */
  free (filevec[0].undiscarded);
  filevec[0].undiscarded = NULL;

  /* Both changed_flag vectors point into FLAG_BUF; clear them together.  */
  free (flag_buf);
  filevec[0].changed_flag = NULL;
  filevec[1].changed_flag = NULL;
}
/* Example #4 */
/* 0 */
double
fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
{
  struct context ctxt;
  int xvec_length = strlen (string1);
  int yvec_length = strlen (string2);
  int i;

  size_t fdiag_len;
  int *buffer;
  size_t bufmax;

  /* short-circuit obvious comparisons */
  if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */
    return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);

  if (lower_bound > 0)
    {
      /* Compute a quick upper bound.
         Each edit is an insertion or deletion of an element, hence modifies
         the length of the sequence by at most 1.
         Therefore, when starting from a sequence X and ending at a sequence Y,
         with N edits,  | yvec_length - xvec_length | <= N.  (Proof by
         induction over N.)
         So, at the end, we will have
           edit_count >= | xvec_length - yvec_length |.
         and hence
           result
             = (xvec_length + yvec_length - edit_count)
               / (xvec_length + yvec_length)
             <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
                / (xvec_length + yvec_length)
             = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
       */
      volatile double upper_bound =
        (double) (2 * MIN (xvec_length, yvec_length))
        / (xvec_length + yvec_length);

      if (upper_bound < lower_bound) /* Prob: 74% */
        /* Return an arbitrary value < LOWER_BOUND.  */
        return 0.0;

#if CHAR_BIT <= 8
      /* When X and Y are both small, avoid the overhead of setting up an
         array of size 256.  */
      if (xvec_length + yvec_length >= 20) /* Prob: 99% */
        {
          /* Compute a less quick upper bound.
             Each edit is an insertion or deletion of a character, hence
             modifies the occurrence count of a character by 1 and leaves the
             other occurrence counts unchanged.
             Therefore, when starting from a sequence X and ending at a
             sequence Y, and denoting the occurrence count of C in X with
             OCC (X, C), with N edits,
               sum_C | OCC (X, C) - OCC (Y, C) | <= N.
             (Proof by induction over N.)
             So, at the end, we will have
               edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |,
             and hence
               result
                 = (xvec_length + yvec_length - edit_count)
                   / (xvec_length + yvec_length)
                 <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |)
                    / (xvec_length + yvec_length).
           */
          int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */
          int sum;

          /* Determine the occurrence counts in X.  */
          memset (occ_diff, 0, sizeof (occ_diff));
          for (i = xvec_length - 1; i >= 0; i--)
            occ_diff[(unsigned char) string1[i]]++;
          /* Subtract the occurrence counts in Y.  */
          for (i = yvec_length - 1; i >= 0; i--)
            occ_diff[(unsigned char) string2[i]]--;
          /* Sum up the absolute values.  */
          sum = 0;
          for (i = 0; i <= UCHAR_MAX; i++)
            {
              int d = occ_diff[i];
              sum += (d >= 0 ? d : -d);
            }

          upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length);

          if (upper_bound < lower_bound) /* Prob: 66% */
            /* Return an arbitrary value < LOWER_BOUND.  */
            return 0.0;
        }
#endif
    }

  /* set the info for each string.  */
  ctxt.xvec = string1;
  ctxt.yvec = string2;

  /* Set TOO_EXPENSIVE to be approximate square root of input size,
     bounded below by 256.  */
  ctxt.too_expensive = 1;
  for (i = xvec_length + yvec_length;
       i != 0;
       i >>= 2)
    ctxt.too_expensive <<= 1;
  if (ctxt.too_expensive < 256)
    ctxt.too_expensive = 256;

  /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
  fdiag_len = xvec_length + yvec_length + 3;
  gl_once (keys_init_once, keys_init);
  buffer = (int *) gl_tls_get (buffer_key);
  bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
  if (fdiag_len > bufmax)
    {
      /* Need more memory.  */
      bufmax = 2 * bufmax;
      if (fdiag_len > bufmax)
        bufmax = fdiag_len;
      /* Calling xrealloc would be a waste: buffer's contents does not need
         to be preserved.  */
      if (buffer != NULL)
        free (buffer);
      buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int));
      gl_tls_set (buffer_key, buffer);
      gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
    }
  ctxt.fdiag = buffer + yvec_length + 1;
  ctxt.bdiag = ctxt.fdiag + fdiag_len;

  /* The edit_count is only ever increased.  The computation can be aborted
     when
       (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length)
       < lower_bound,
     or equivalently
       edit_count > (xvec_length + yvec_length) * (1 - lower_bound)
     or equivalently
       edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)).
     We need to add an epsilon inside the floor(...) argument, to neutralize
     rounding errors.  */
  ctxt.edit_count_limit =
    (lower_bound < 1.0
     ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001))
     : 0);

  /* Now do the main comparison algorithm */
  ctxt.edit_count = - ctxt.edit_count_limit;
  if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */
    /* The edit_count passed the limit.  Hence the result would be
       < lower_bound.  We can return any value < lower_bound instead.  */
    return 0.0;
  ctxt.edit_count += ctxt.edit_count_limit;

  /* The result is
        ((number of chars in common) / (average length of the strings)).
     The numerator is
        = xvec_length - (number of calls to NOTE_DELETE)
        = yvec_length - (number of calls to NOTE_INSERT)
        = 1/2 * (xvec_length + yvec_length - (number of edits)).
     This is admittedly biased towards finding that the strings are
     similar, however it does produce meaningful results.  */
  return ((double) (xvec_length + yvec_length - ctxt.edit_count)
          / (xvec_length + yvec_length));
}