Example #1
0
/* Find the first occurrence in the character string STRING of any character
   in the character string ACCEPT.  Return the number of bytes from the
   beginning of the string to this occurrence, or to the end of the string
   if none exists.  */
size_t
mbscspn (const char *string, const char *accept)
{
  /* Optimize two cases.  */
  if (accept[0] == '\0')
    return strlen (string);
  if (accept[1] == '\0')
    {
      const char *ptr = mbschr (string, accept[0]);
      return (ptr != NULL ? ptr - string : strlen (string));
    }
  /* General case.  */
  if (MB_CUR_MAX > 1)
    {
      mbui_iterator_t iter;

      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
        {
          if (mb_len (mbui_cur (iter)) == 1)
            {
              if (mbschr (accept, * mbui_cur_ptr (iter)))
                goto found;
            }
          else
            {
              mbui_iterator_t aiter;

              for (mbui_init (aiter, accept);
                   mbui_avail (aiter);
                   mbui_advance (aiter))
                if (mb_equal (mbui_cur (aiter), mbui_cur (iter)))
                  goto found;
            }
        }
     found:
      return mbui_cur_ptr (iter) - string;
    }
  else
    return strcspn (string, accept);
}
Example #2
0
/* Find the first occurrence in the character string STRING of any character
   in the character string ACCEPT.  Return the pointer to it, or NULL if none
   exists.  */
char *
mbspbrk (const char *string, const char *accept)
{
  /* Optimize two cases.  */
  if (accept[0] == '\0')
    return NULL;
  if (accept[1] == '\0')
    return mbschr (string, accept[0]);
  /* General case.  */
  if (MB_CUR_MAX > 1)
    {
      mbui_iterator_t iter;

      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
        {
          if (mb_len (mbui_cur (iter)) == 1)
            {
              if (mbschr (accept, * mbui_cur_ptr (iter)))
                return (char *) mbui_cur_ptr (iter);
            }
          else
            {
              mbui_iterator_t aiter;

              for (mbui_init (aiter, accept);
                   mbui_avail (aiter);
                   mbui_advance (aiter))
                if (mb_equal (mbui_cur (aiter), mbui_cur (iter)))
                  return (char *) mbui_cur_ptr (iter);
            }
        }
      return NULL;
    }
  else
    return strpbrk (string, accept);
}
Example #3
0
/* Knuth-Morris-Pratt algorithm.
   See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
   Return a boolean indicating success:
   Return true and set *RESULTP if the search was completed.
   Return false if it was aborted because not enough memory was available.  */
static bool
knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
                              const char **resultp)
{
  size_t m = mbslen (needle);
  mbchar_t *needle_mbchars;
  size_t *table;

  /* Allocate room for needle_mbchars and the table.  */
  char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
  if (memory == NULL)
    return false;
  needle_mbchars = (mbchar_t *) memory;
  table = (size_t *) (memory + m * sizeof (mbchar_t));

  /* Fill needle_mbchars.  */
  {
    mbui_iterator_t iter;
    size_t j;

    j = 0;
    for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      mb_copy (&needle_mbchars[j], &mbui_cur (iter));
  }

  /* Fill the table.
     For 0 < i < m:
       0 < table[i] <= i is defined such that
       forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
       and table[i] is as large as possible with this property.
     This implies:
     1) For 0 < i < m:
          If table[i] < i,
          needle[table[i]..i-1] = needle[0..i-1-table[i]].
     2) For 0 < i < m:
          rhaystack[0..i-1] == needle[0..i-1]
          and exists h, i <= h < m: rhaystack[h] != needle[h]
          implies
          forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
     table[0] remains uninitialized.  */
  {
    size_t i, j;

    /* i = 1: Nothing to verify for x = 0.  */
    table[1] = 1;
    j = 0;

    for (i = 2; i < m; i++)
      {
        /* Here: j = i-1 - table[i-1].
           The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
           for x < table[i-1], by induction.
           Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
        mbchar_t *b = &needle_mbchars[i - 1];

        for (;;)
          {
            /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
               is known to hold for x < i-1-j.
               Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
            if (mb_equal (*b, needle_mbchars[j]))
              {
                /* Set table[i] := i-1-j.  */
                table[i] = i - ++j;
                break;
              }
            /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
               for x = i-1-j, because
                 needle[i-1] != needle[j] = needle[i-1-x].  */
            if (j == 0)
              {
                /* The inequality holds for all possible x.  */
                table[i] = i;
                break;
              }
            /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
               for i-1-j < x < i-1-j+table[j], because for these x:
                 needle[x..i-2]
                 = needle[x-(i-1-j)..j-1]
                 != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
                    = needle[0..i-2-x],
               hence needle[x..i-1] != needle[0..i-1-x].
               Furthermore
                 needle[i-1-j+table[j]..i-2]
                 = needle[table[j]..j-1]
                 = needle[0..j-1-table[j]]  (by definition of table[j]).  */
            j = j - table[j];
          }
        /* Here: j = i - table[i].  */
      }
  }

  /* Search, using the table to accelerate the processing.  */
  {
    size_t j;
    mbui_iterator_t rhaystack;
    mbui_iterator_t phaystack;

    *resultp = NULL;
    j = 0;
    mbui_init (rhaystack, haystack);
    mbui_init (phaystack, haystack);
    /* Invariant: phaystack = rhaystack + j.  */
    while (mbui_avail (phaystack))
      if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
        {
          j++;
          mbui_advance (phaystack);
          if (j == m)
            {
              /* The entire needle has been found.  */
              *resultp = mbui_cur_ptr (rhaystack);
              break;
            }
        }
      else if (j > 0)
        {
          /* Found a match of needle[0..j-1], mismatch at needle[j].  */
          size_t count = table[j];
          j -= count;
          for (; count > 0; count--)
            {
              if (!mbui_avail (rhaystack))
                abort ();
              mbui_advance (rhaystack);
            }
        }
      else
        {
          /* Found a mismatch at needle[0] already.  */
          if (!mbui_avail (rhaystack))
            abort ();
          mbui_advance (rhaystack);
          mbui_advance (phaystack);
        }
  }

  freea (memory);
  return true;
}
Example #4
0
/* Find the first occurrence of the character string NEEDLE in the character
   string HAYSTACK.  Return NULL if NEEDLE is not found in HAYSTACK.  */
char *
mbsstr (const char *haystack, const char *needle)
{
  /* Be careful not to look at the entire extent of haystack or needle
     until needed.  This is useful because of these two cases:
       - haystack may be very long, and a match of needle found early,
       - needle may be very long, and not even a short initial segment of
         needle may be found in haystack.  */
  if (MB_CUR_MAX > 1)
    {
      mbui_iterator_t iter_needle;

      mbui_init (iter_needle, needle);
      if (mbui_avail (iter_needle))
        {
          /* Minimizing the worst-case complexity:
             Let n = mbslen(haystack), m = mbslen(needle).
             The naïve algorithm is O(n*m) worst-case.
             The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
             memory allocation.
             To achieve linear complexity and yet amortize the cost of the
             memory allocation, we activate the Knuth-Morris-Pratt algorithm
             only once the naïve algorithm has already run for some time; more
             precisely, when
               - the outer loop count is >= 10,
               - the average number of comparisons per outer loop is >= 5,
               - the total number of comparisons is >= m.
             But we try it only once.  If the memory allocation attempt failed,
             we don't retry it.  */
          bool try_kmp = true;
          size_t outer_loop_count = 0;
          size_t comparison_count = 0;
          size_t last_ccount = 0;                  /* last comparison count */
          mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */

          mbui_iterator_t iter_haystack;

          mbui_init (iter_needle_last_ccount, needle);
          mbui_init (iter_haystack, haystack);
          for (;; mbui_advance (iter_haystack))
            {
              if (!mbui_avail (iter_haystack))
                /* No match.  */
                return NULL;

              /* See whether it's advisable to use an asymptotically faster
                 algorithm.  */
              if (try_kmp
                  && outer_loop_count >= 10
                  && comparison_count >= 5 * outer_loop_count)
                {
                  /* See if needle + comparison_count now reaches the end of
                     needle.  */
                  size_t count = comparison_count - last_ccount;
                  for (;
                       count > 0 && mbui_avail (iter_needle_last_ccount);
                       count--)
                    mbui_advance (iter_needle_last_ccount);
                  last_ccount = comparison_count;
                  if (!mbui_avail (iter_needle_last_ccount))
                    {
                      /* Try the Knuth-Morris-Pratt algorithm.  */
                      const char *result;
                      bool success =
                        knuth_morris_pratt_multibyte (haystack, needle,
                                                      &result);
                      if (success)
                        return (char *) result;
                      try_kmp = false;
                    }
                }

              outer_loop_count++;
              comparison_count++;
              if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
                /* The first character matches.  */
                {
                  mbui_iterator_t rhaystack;
                  mbui_iterator_t rneedle;

                  memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
                  mbui_advance (rhaystack);

                  mbui_init (rneedle, needle);
                  if (!mbui_avail (rneedle))
                    abort ();
                  mbui_advance (rneedle);

                  for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
                    {
                      if (!mbui_avail (rneedle))
                        /* Found a match.  */
                        return (char *) mbui_cur_ptr (iter_haystack);
                      if (!mbui_avail (rhaystack))
                        /* No match.  */
                        return NULL;
                      comparison_count++;
                      if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
                        /* Nothing in this round.  */
                        break;
                    }
                }
            }
        }
      else
        return (char *) haystack;
    }
  else
    {
      if (*needle != '\0')
        {
          /* Minimizing the worst-case complexity:
             Let n = strlen(haystack), m = strlen(needle).
             The naïve algorithm is O(n*m) worst-case.
             The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
             memory allocation.
             To achieve linear complexity and yet amortize the cost of the
             memory allocation, we activate the Knuth-Morris-Pratt algorithm
             only once the naïve algorithm has already run for some time; more
             precisely, when
               - the outer loop count is >= 10,
               - the average number of comparisons per outer loop is >= 5,
               - the total number of comparisons is >= m.
             But we try it only once.  If the memory allocation attempt failed,
             we don't retry it.  */
          bool try_kmp = true;
          size_t outer_loop_count = 0;
          size_t comparison_count = 0;
          size_t last_ccount = 0;                  /* last comparison count */
          const char *needle_last_ccount = needle; /* = needle + last_ccount */

          /* Speed up the following searches of needle by caching its first
             character.  */
          char b = *needle++;

          for (;; haystack++)
            {
              if (*haystack == '\0')
                /* No match.  */
                return NULL;

              /* See whether it's advisable to use an asymptotically faster
                 algorithm.  */
              if (try_kmp
                  && outer_loop_count >= 10
                  && comparison_count >= 5 * outer_loop_count)
                {
                  /* See if needle + comparison_count now reaches the end of
                     needle.  */
                  if (needle_last_ccount != NULL)
                    {
                      needle_last_ccount +=
                        strnlen (needle_last_ccount,
                                 comparison_count - last_ccount);
                      if (*needle_last_ccount == '\0')
                        needle_last_ccount = NULL;
                      last_ccount = comparison_count;
                    }
                  if (needle_last_ccount == NULL)
                    {
                      /* Try the Knuth-Morris-Pratt algorithm.  */
                      const char *result;
                      bool success =
                        knuth_morris_pratt_unibyte (haystack, needle - 1,
                                                    &result);
                      if (success)
                        return (char *) result;
                      try_kmp = false;
                    }
                }

              outer_loop_count++;
              comparison_count++;
              if (*haystack == b)
                /* The first character matches.  */
                {
                  const char *rhaystack = haystack + 1;
                  const char *rneedle = needle;

                  for (;; rhaystack++, rneedle++)
                    {
                      if (*rneedle == '\0')
                        /* Found a match.  */
                        return (char *) haystack;
                      if (*rhaystack == '\0')
                        /* No match.  */
                        return NULL;
                      comparison_count++;
                      if (*rhaystack != *rneedle)
                        /* Nothing in this round.  */
                        break;
                    }
                }
            }
        }
      else
        return (char *) haystack;
    }
}
Example #5
0
static bool
knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
			      const char **resultp)
{
  size_t m = mbslen (needle);
  mbchar_t *needle_mbchars;
  size_t *table;

  /* Allocate room for needle_mbchars and the table.  */
  char *memory = (char *) malloca (m * (sizeof (mbchar_t) + sizeof (size_t)));
  if (memory == NULL)
    return false;
  needle_mbchars = (mbchar_t *) memory;
  table = (size_t *) (memory + m * sizeof (mbchar_t));

  /* Fill needle_mbchars.  */
  {
    mbui_iterator_t iter;
    size_t j;

    j = 0;
    for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      mb_copy (&needle_mbchars[j], &mbui_cur (iter));
  }

  /* Fill the table.
     For 0 < i < m:
       0 < table[i] <= i is defined such that
       rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i]
       implies
       forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1],
       and table[i] is as large as possible with this property.
     table[0] remains uninitialized.  */
  {
    size_t i, j;

    table[1] = 1;
    j = 0;
    for (i = 2; i < m; i++)
      {
	mbchar_t *b = &needle_mbchars[i - 1];

	for (;;)
	  {
	    if (mb_equal (*b, needle_mbchars[j]))
	      {
		table[i] = i - ++j;
		break;
	      }
	    if (j == 0)
	      {
		table[i] = i;
		break;
	      }
	    j = j - table[j];
	  }
      }
  }

  /* Search, using the table to accelerate the processing.  */
  {
    size_t j;
    mbui_iterator_t rhaystack;
    mbui_iterator_t phaystack;

    *resultp = NULL;
    j = 0;
    mbui_init (rhaystack, haystack);
    mbui_init (phaystack, haystack);
    /* Invariant: phaystack = rhaystack + j.  */
    while (mbui_avail (phaystack))
      if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
	{
	  j++;
	  mbui_advance (phaystack);
	  if (j == m)
	    {
	      /* The entire needle has been found.  */
	      *resultp = mbui_cur_ptr (rhaystack);
	      break;
	    }
	}
      else if (j > 0)
	{
	  /* Found a match of needle[0..j-1], mismatch at needle[j].  */
	  size_t count = table[j];
	  j -= count;
	  for (; count > 0; count--)
	    {
	      if (!mbui_avail (rhaystack))
		abort ();
	      mbui_advance (rhaystack);
	    }
	}
      else
	{
	  /* Found a mismatch at needle[0] already.  */
	  if (!mbui_avail (rhaystack))
	    abort ();
	  mbui_advance (rhaystack);
	  mbui_advance (phaystack);
	}
  }

  freea (memory);
  return true;
}
Example #6
0
/* Find the first occurrence of NEEDLE in HAYSTACK.  */
char *
strstr (const char *haystack, const char *needle)
{
  /* Be careful not to look at the entire extent of haystack or needle
     until needed.  This is useful because of these two cases:
       - haystack may be very long, and a match of needle found early,
       - needle may be very long, and not even a short initial segment of
         needle may be found in haystack.  */
#if HAVE_MBRTOWC
  if (MB_CUR_MAX > 1)
    {
      mbui_iterator_t iter_needle;

      mbui_init (iter_needle, needle);
      if (mbui_avail (iter_needle))
	{
	  mbui_iterator_t iter_haystack;

	  mbui_init (iter_haystack, haystack);
	  for (;; mbui_advance (iter_haystack))
	    {
	      if (!mbui_avail (iter_haystack))
		/* No match.  */
		return NULL;

	      if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
		/* The first character matches.  */
		{
		  mbui_iterator_t rhaystack;
		  mbui_iterator_t rneedle;

		  memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
		  mbui_advance (rhaystack);

		  mbui_init (rneedle, needle);
		  if (!mbui_avail (rneedle))
		    abort ();
		  mbui_advance (rneedle);

		  for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
		    {
		      if (!mbui_avail (rneedle))
			/* Found a match.  */
			return (char *) mbui_cur_ptr (iter_haystack);
		      if (!mbui_avail (rhaystack))
			/* No match.  */
			return NULL;
		      if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
			/* Nothing in this round.  */
			break;
		    }
		}
	    }
	}
      else
	return (char *) haystack;
    }
  else
#endif
    {
      if (*needle != '\0')
	{
	  /* Speed up the following searches of needle by caching its first
	     character.  */
	  char b = *needle++;

	  for (;; haystack++)
	    {
	      if (*haystack == '\0')
		/* No match.  */
		return NULL;
	      if (*haystack == b)
		/* The first character matches.  */
		{
		  const char *rhaystack = haystack + 1;
		  const char *rneedle = needle;

		  for (;; rhaystack++, rneedle++)
		    {
		      if (*rneedle == '\0')
			/* Found a match.  */
			return (char *) haystack;
		      if (*rhaystack == '\0')
			/* No match.  */
			return NULL;
		      if (*rhaystack != *rneedle)
			/* Nothing in this round.  */
			break;
		    }
		}
	    }
	}
      else
	return (char *) haystack;
    }
}
Example #7
0
/* Find the first occurrence in the character string STRING of any character
   not in the character string REJECT.  Return the number of bytes from the
   beginning of the string to this occurrence, or to the end of the string
   if none exists.  */
size_t
mbsspn (const char *string, const char *reject)
{
  /* Optimize two cases.  */
  if (reject[0] == '\0')
    return 0;
  if (reject[1] == '\0')
    {
      unsigned char uc = (unsigned char) reject[0];

      if (MB_CUR_MAX > 1)
        {
          mbui_iterator_t iter;

          for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
            if (!(mb_len (mbui_cur (iter)) == 1
                  && (unsigned char) * mbui_cur_ptr (iter) == uc))
              break;
          return mbui_cur_ptr (iter) - string;
        }
      else
        {
          const char *ptr;

          for (ptr = string; *ptr != '\0'; ptr++)
            if ((unsigned char) *ptr != uc)
              break;
          return ptr - string;
        }
    }
  /* General case.  */
  if (MB_CUR_MAX > 1)
    {
      mbui_iterator_t iter;

      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
        {
          if (mb_len (mbui_cur (iter)) == 1)
            {
              if (mbschr (reject, * mbui_cur_ptr (iter)) == NULL)
                goto found;
            }
          else
            {
              mbui_iterator_t aiter;

              for (mbui_init (aiter, reject);; mbui_advance (aiter))
                {
                  if (!mbui_avail (aiter))
                    goto found;
                  if (mb_equal (mbui_cur (aiter), mbui_cur (iter)))
                    break;
                }
            }
        }
     found:
      return mbui_cur_ptr (iter) - string;
    }
  else
    return strspn (string, reject);
}