/* Compare the initial segment of the character string STRING consisting of at most mbslen (PREFIX) characters with the character string PREFIX, ignoring case. If the two match, return a pointer to the first byte after this prefix in STRING. Otherwise, return NULL. Note: This function may, in multibyte locales, return non-NULL if STRING is of smaller length than PREFIX! */ char * mbspcasecmp (const char *string, const char *prefix) { /* This is essentially the same as mbsncasecmp (string, prefix, mbslen (prefix)) just with small optimizations. */ if (string == prefix) return (char *) (string + strlen (string)); /* Be careful not to look at the entire extent of STRING or PREFIX until needed. This is useful because when two strings differ, the difference is most often already in the very few first characters. */ if (MB_CUR_MAX > 1) { mbui_iterator_t iter1; mbui_iterator_t iter2; mbui_init (iter1, string); mbui_init (iter2, prefix); while (mbui_avail (iter1) && mbui_avail (iter2)) { int cmp = mb_casecmp (mbui_cur (iter1), mbui_cur (iter2)); if (cmp != 0) return NULL; mbui_advance (iter1); mbui_advance (iter2); } if (!mbui_avail (iter2)) /* PREFIX equals STRING or is terminated before STRING. */ return (char *) mbui_cur_ptr (iter1); else /* STRING terminated before PREFIX. */ return NULL; } else { const unsigned char *p1 = (const unsigned char *) string; const unsigned char *p2 = (const unsigned char *) prefix; unsigned char c1, c2; for (; ; p1++, p2++) { c1 = TOLOWER (*p1); c2 = TOLOWER (*p2); if (c2 == '\0' || c1 != c2) break; } if (c2 == '\0') /* PREFIX equals STRING or is terminated before STRING. */ return (char *) p1; else /* STRING terminated before PREFIX. */ return NULL; } }
/* Locate the first single-byte character C in the character string STRING, and return a pointer to it. Return NULL if C is not found in STRING. */ char * mbschr (const char *string, int c) { if (MB_CUR_MAX > 1 /* Optimization: We know that ASCII characters < 0x30 don't occur as part of multibyte characters longer than 1 byte. Hence, if c < 0x30, the faster unibyte loop can be used. */ && (unsigned char) c >= 0x30) { mbui_iterator_t iter; for (mbui_init (iter, string);; mbui_advance (iter)) { if (!mbui_avail (iter)) goto notfound; if (mb_len (mbui_cur (iter)) == 1 && (unsigned char) * mbui_cur_ptr (iter) == (unsigned char) c) break; } return (char *) mbui_cur_ptr (iter); notfound: return NULL; } else return strchr (string, c); }
/* Find the first occurrence in the character string STRING of any character in the character string ACCEPT. Return the number of bytes from the beginning of the string to this occurrence, or to the end of the string if none exists. */ size_t mbscspn (const char *string, const char *accept) { /* Optimize two cases. */ if (accept[0] == '\0') return strlen (string); if (accept[1] == '\0') { const char *ptr = mbschr (string, accept[0]); return (ptr != NULL ? ptr - string : strlen (string)); } /* General case. */ if (MB_CUR_MAX > 1) { mbui_iterator_t iter; for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) { if (mb_len (mbui_cur (iter)) == 1) { if (mbschr (accept, * mbui_cur_ptr (iter))) goto found; } else { mbui_iterator_t aiter; for (mbui_init (aiter, accept); mbui_avail (aiter); mbui_advance (aiter)) if (mb_equal (mbui_cur (aiter), mbui_cur (iter))) goto found; } } found: return mbui_cur_ptr (iter) - string; } else return strcspn (string, accept); }
/* Find the first occurrence in the character string STRING of any character in the character string ACCEPT. Return the pointer to it, or NULL if none exists. */ char * mbspbrk (const char *string, const char *accept) { /* Optimize two cases. */ if (accept[0] == '\0') return NULL; if (accept[1] == '\0') return mbschr (string, accept[0]); /* General case. */ if (MB_CUR_MAX > 1) { mbui_iterator_t iter; for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) { if (mb_len (mbui_cur (iter)) == 1) { if (mbschr (accept, * mbui_cur_ptr (iter))) return (char *) mbui_cur_ptr (iter); } else { mbui_iterator_t aiter; for (mbui_init (aiter, accept); mbui_avail (aiter); mbui_advance (aiter)) if (mb_equal (mbui_cur (aiter), mbui_cur (iter))) return (char *) mbui_cur_ptr (iter); } } return NULL; } else return strpbrk (string, accept); }
/* Ditto, for case-insensitive hashes */ static size_t string_hasher_ci (void const *data, size_t n_buckets) { char const *p = data; mbui_iterator_t iter; size_t value = 0; for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter)) { mbchar_t m = mbui_cur (iter); wchar_t wc; if (m.wc_valid) wc = towlower (m.wc); else wc = *m.ptr; value = (value * 31 + wc) % n_buckets; } return value; }
/* Knuth-Morris-Pratt algorithm. See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm Return a boolean indicating success: Return true and set *RESULTP if the search was completed. Return false if it was aborted because not enough memory was available. */ static bool knuth_morris_pratt_multibyte (const char *haystack, const char *needle, const char **resultp) { size_t m = mbslen (needle); mbchar_t *needle_mbchars; size_t *table; /* Allocate room for needle_mbchars and the table. */ char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t)); if (memory == NULL) return false; needle_mbchars = (mbchar_t *) memory; table = (size_t *) (memory + m * sizeof (mbchar_t)); /* Fill needle_mbchars. */ { mbui_iterator_t iter; size_t j; j = 0; for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) mb_copy (&needle_mbchars[j], &mbui_cur (iter)); } /* Fill the table. For 0 < i < m: 0 < table[i] <= i is defined such that forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], and table[i] is as large as possible with this property. This implies: 1) For 0 < i < m: If table[i] < i, needle[table[i]..i-1] = needle[0..i-1-table[i]]. 2) For 0 < i < m: rhaystack[0..i-1] == needle[0..i-1] and exists h, i <= h < m: rhaystack[h] != needle[h] implies forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. table[0] remains uninitialized. */ { size_t i, j; /* i = 1: Nothing to verify for x = 0. */ table[1] = 1; j = 0; for (i = 2; i < m; i++) { /* Here: j = i-1 - table[i-1]. The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold for x < table[i-1], by induction. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ mbchar_t *b = &needle_mbchars[i - 1]; for (;;) { /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold for x < i-1-j. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ if (mb_equal (*b, needle_mbchars[j])) { /* Set table[i] := i-1-j. */ table[i] = i - ++j; break; } /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds for x = i-1-j, because needle[i-1] != needle[j] = needle[i-1-x]. */ if (j == 0) { /* The inequality holds for all possible x. */ table[i] = i; break; } /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds for i-1-j < x < i-1-j+table[j], because for these x: needle[x..i-2] = needle[x-(i-1-j)..j-1] != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) = needle[0..i-2-x], hence needle[x..i-1] != needle[0..i-1-x]. Furthermore needle[i-1-j+table[j]..i-2] = needle[table[j]..j-1] = needle[0..j-1-table[j]] (by definition of table[j]). */ j = j - table[j]; } /* Here: j = i - table[i]. */ } } /* Search, using the table to accelerate the processing. */ { size_t j; mbui_iterator_t rhaystack; mbui_iterator_t phaystack; *resultp = NULL; j = 0; mbui_init (rhaystack, haystack); mbui_init (phaystack, haystack); /* Invariant: phaystack = rhaystack + j. */ while (mbui_avail (phaystack)) if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) { j++; mbui_advance (phaystack); if (j == m) { /* The entire needle has been found. */ *resultp = mbui_cur_ptr (rhaystack); break; } } else if (j > 0) { /* Found a match of needle[0..j-1], mismatch at needle[j]. */ size_t count = table[j]; j -= count; for (; count > 0; count--) { if (!mbui_avail (rhaystack)) abort (); mbui_advance (rhaystack); } } else { /* Found a mismatch at needle[0] already. */ if (!mbui_avail (rhaystack)) abort (); mbui_advance (rhaystack); mbui_advance (phaystack); } } freea (memory); return true; }
/* Find the first occurrence of the character string NEEDLE in the character string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ char * mbsstr (const char *haystack, const char *needle) { /* Be careful not to look at the entire extent of haystack or needle until needed. This is useful because of these two cases: - haystack may be very long, and a match of needle found early, - needle may be very long, and not even a short initial segment of needle may be found in haystack. */ if (MB_CUR_MAX > 1) { mbui_iterator_t iter_needle; mbui_init (iter_needle, needle); if (mbui_avail (iter_needle)) { /* Minimizing the worst-case complexity: Let n = mbslen(haystack), m = mbslen(needle). The naïve algorithm is O(n*m) worst-case. The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a memory allocation. To achieve linear complexity and yet amortize the cost of the memory allocation, we activate the Knuth-Morris-Pratt algorithm only once the naïve algorithm has already run for some time; more precisely, when - the outer loop count is >= 10, - the average number of comparisons per outer loop is >= 5, - the total number of comparisons is >= m. But we try it only once. If the memory allocation attempt failed, we don't retry it. */ bool try_kmp = true; size_t outer_loop_count = 0; size_t comparison_count = 0; size_t last_ccount = 0; /* last comparison count */ mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ mbui_iterator_t iter_haystack; mbui_init (iter_needle_last_ccount, needle); mbui_init (iter_haystack, haystack); for (;; mbui_advance (iter_haystack)) { if (!mbui_avail (iter_haystack)) /* No match. */ return NULL; /* See whether it's advisable to use an asymptotically faster algorithm. */ if (try_kmp && outer_loop_count >= 10 && comparison_count >= 5 * outer_loop_count) { /* See if needle + comparison_count now reaches the end of needle. */ size_t count = comparison_count - last_ccount; for (; count > 0 && mbui_avail (iter_needle_last_ccount); count--) mbui_advance (iter_needle_last_ccount); last_ccount = comparison_count; if (!mbui_avail (iter_needle_last_ccount)) { /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; bool success = knuth_morris_pratt_multibyte (haystack, needle, &result); if (success) return (char *) result; try_kmp = false; } } outer_loop_count++; comparison_count++; if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) /* The first character matches. */ { mbui_iterator_t rhaystack; mbui_iterator_t rneedle; memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); mbui_advance (rhaystack); mbui_init (rneedle, needle); if (!mbui_avail (rneedle)) abort (); mbui_advance (rneedle); for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) { if (!mbui_avail (rneedle)) /* Found a match. */ return (char *) mbui_cur_ptr (iter_haystack); if (!mbui_avail (rhaystack)) /* No match. */ return NULL; comparison_count++; if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) /* Nothing in this round. */ break; } } } } else return (char *) haystack; } else { if (*needle != '\0') { /* Minimizing the worst-case complexity: Let n = strlen(haystack), m = strlen(needle). The naïve algorithm is O(n*m) worst-case. The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a memory allocation. To achieve linear complexity and yet amortize the cost of the memory allocation, we activate the Knuth-Morris-Pratt algorithm only once the naïve algorithm has already run for some time; more precisely, when - the outer loop count is >= 10, - the average number of comparisons per outer loop is >= 5, - the total number of comparisons is >= m. But we try it only once. If the memory allocation attempt failed, we don't retry it. */ bool try_kmp = true; size_t outer_loop_count = 0; size_t comparison_count = 0; size_t last_ccount = 0; /* last comparison count */ const char *needle_last_ccount = needle; /* = needle + last_ccount */ /* Speed up the following searches of needle by caching its first character. */ char b = *needle++; for (;; haystack++) { if (*haystack == '\0') /* No match. */ return NULL; /* See whether it's advisable to use an asymptotically faster algorithm. */ if (try_kmp && outer_loop_count >= 10 && comparison_count >= 5 * outer_loop_count) { /* See if needle + comparison_count now reaches the end of needle. */ if (needle_last_ccount != NULL) { needle_last_ccount += strnlen (needle_last_ccount, comparison_count - last_ccount); if (*needle_last_ccount == '\0') needle_last_ccount = NULL; last_ccount = comparison_count; } if (needle_last_ccount == NULL) { /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; bool success = knuth_morris_pratt_unibyte (haystack, needle - 1, &result); if (success) return (char *) result; try_kmp = false; } } outer_loop_count++; comparison_count++; if (*haystack == b) /* The first character matches. */ { const char *rhaystack = haystack + 1; const char *rneedle = needle; for (;; rhaystack++, rneedle++) { if (*rneedle == '\0') /* Found a match. */ return (char *) haystack; if (*rhaystack == '\0') /* No match. */ return NULL; comparison_count++; if (*rhaystack != *rneedle) /* Nothing in this round. */ break; } } } } else return (char *) haystack; } }
static bool knuth_morris_pratt_multibyte (const char *haystack, const char *needle, const char **resultp) { size_t m = mbslen (needle); mbchar_t *needle_mbchars; size_t *table; /* Allocate room for needle_mbchars and the table. */ char *memory = (char *) malloca (m * (sizeof (mbchar_t) + sizeof (size_t))); if (memory == NULL) return false; needle_mbchars = (mbchar_t *) memory; table = (size_t *) (memory + m * sizeof (mbchar_t)); /* Fill needle_mbchars. */ { mbui_iterator_t iter; size_t j; j = 0; for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) mb_copy (&needle_mbchars[j], &mbui_cur (iter)); } /* Fill the table. For 0 < i < m: 0 < table[i] <= i is defined such that rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] implies forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], and table[i] is as large as possible with this property. table[0] remains uninitialized. */ { size_t i, j; table[1] = 1; j = 0; for (i = 2; i < m; i++) { mbchar_t *b = &needle_mbchars[i - 1]; for (;;) { if (mb_equal (*b, needle_mbchars[j])) { table[i] = i - ++j; break; } if (j == 0) { table[i] = i; break; } j = j - table[j]; } } } /* Search, using the table to accelerate the processing. */ { size_t j; mbui_iterator_t rhaystack; mbui_iterator_t phaystack; *resultp = NULL; j = 0; mbui_init (rhaystack, haystack); mbui_init (phaystack, haystack); /* Invariant: phaystack = rhaystack + j. */ while (mbui_avail (phaystack)) if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) { j++; mbui_advance (phaystack); if (j == m) { /* The entire needle has been found. */ *resultp = mbui_cur_ptr (rhaystack); break; } } else if (j > 0) { /* Found a match of needle[0..j-1], mismatch at needle[j]. */ size_t count = table[j]; j -= count; for (; count > 0; count--) { if (!mbui_avail (rhaystack)) abort (); mbui_advance (rhaystack); } } else { /* Found a mismatch at needle[0] already. */ if (!mbui_avail (rhaystack)) abort (); mbui_advance (rhaystack); mbui_advance (phaystack); } } freea (memory); return true; }
/* Compare strings S1 and S2, ignoring case, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. Note: This function may, in multibyte locales, return 0 for strings of different lengths! */ int strcasecmp (const char *s1, const char *s2) { if (s1 == s2) return 0; /* Be careful not to look at the entire extent of s1 or s2 until needed. This is useful because when two strings differ, the difference is most often already in the very few first characters. */ #if HAVE_MBRTOWC if (MB_CUR_MAX > 1) { mbui_iterator_t iter1; mbui_iterator_t iter2; mbui_init (iter1, s1); mbui_init (iter2, s2); while (mbui_avail (iter1) && mbui_avail (iter2)) { int cmp = mb_casecmp (mbui_cur (iter1), mbui_cur (iter2)); if (cmp != 0) return cmp; mbui_advance (iter1); mbui_advance (iter2); } if (mbui_avail (iter1)) /* s2 terminated before s1. */ return 1; if (mbui_avail (iter2)) /* s1 terminated before s2. */ return -1; return 0; } else #endif { const unsigned char *p1 = (const unsigned char *) s1; const unsigned char *p2 = (const unsigned char *) s2; unsigned char c1, c2; do { c1 = TOLOWER (*p1); c2 = TOLOWER (*p2); if (c1 == '\0') break; ++p1; ++p2; } while (c1 == c2); if (UCHAR_MAX <= INT_MAX) return c1 - c2; else /* On machines where 'char' and 'int' are types of the same size, the difference of two 'unsigned char' values - including the sign bit - doesn't fit in an 'int'. */ return (c1 > c2 ? 1 : c1 < c2 ? -1 : 0); } }
/* Tests whether STRING contains trim (SUB), starting and ending at word boundaries. Here, instead of implementing Unicode Standard Annex #29 for determining word boundaries, we assume that trim (SUB) starts and ends with words and only test whether the part before it ends with a non-word and the part after it starts with a non-word. */ static bool mbsstr_trimmed_wordbounded (const char *string, const char *sub) { char *tsub = trim (sub); bool found = false; for (; *string != '\0';) { const char *tsub_in_string = mbsstr (string, tsub); if (tsub_in_string == NULL) break; else { if (MB_CUR_MAX > 1) { mbui_iterator_t string_iter; bool word_boundary_before; bool word_boundary_after; mbui_init (string_iter, string); word_boundary_before = true; if (mbui_cur_ptr (string_iter) < tsub_in_string) { mbchar_t last_char_before_tsub; do { if (!mbui_avail (string_iter)) abort (); last_char_before_tsub = mbui_cur (string_iter); mbui_advance (string_iter); } while (mbui_cur_ptr (string_iter) < tsub_in_string); if (mb_isalnum (last_char_before_tsub)) word_boundary_before = false; } mbui_init (string_iter, tsub_in_string); { mbui_iterator_t tsub_iter; for (mbui_init (tsub_iter, tsub); mbui_avail (tsub_iter); mbui_advance (tsub_iter)) { if (!mbui_avail (string_iter)) abort (); mbui_advance (string_iter); } } word_boundary_after = true; if (mbui_avail (string_iter)) { mbchar_t first_char_after_tsub = mbui_cur (string_iter); if (mb_isalnum (first_char_after_tsub)) word_boundary_after = false; } if (word_boundary_before && word_boundary_after) { found = true; break; } mbui_init (string_iter, tsub_in_string); if (!mbui_avail (string_iter)) break; string = tsub_in_string + mb_len (mbui_cur (string_iter)); } else { bool word_boundary_before; const char *p; bool word_boundary_after; word_boundary_before = true; if (string < tsub_in_string) if (isalnum ((unsigned char) tsub_in_string[-1])) word_boundary_before = false; p = tsub_in_string + strlen (tsub); word_boundary_after = true; if (*p != '\0') if (isalnum ((unsigned char) *p)) word_boundary_after = false; if (word_boundary_before && word_boundary_after) { found = true; break; } if (*tsub_in_string == '\0') break; string = tsub_in_string + 1; } } } free (tsub); return found; }
/* Find the first occurrence of NEEDLE in HAYSTACK. */ char * strstr (const char *haystack, const char *needle) { /* Be careful not to look at the entire extent of haystack or needle until needed. This is useful because of these two cases: - haystack may be very long, and a match of needle found early, - needle may be very long, and not even a short initial segment of needle may be found in haystack. */ #if HAVE_MBRTOWC if (MB_CUR_MAX > 1) { mbui_iterator_t iter_needle; mbui_init (iter_needle, needle); if (mbui_avail (iter_needle)) { mbui_iterator_t iter_haystack; mbui_init (iter_haystack, haystack); for (;; mbui_advance (iter_haystack)) { if (!mbui_avail (iter_haystack)) /* No match. */ return NULL; if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) /* The first character matches. */ { mbui_iterator_t rhaystack; mbui_iterator_t rneedle; memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); mbui_advance (rhaystack); mbui_init (rneedle, needle); if (!mbui_avail (rneedle)) abort (); mbui_advance (rneedle); for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) { if (!mbui_avail (rneedle)) /* Found a match. */ return (char *) mbui_cur_ptr (iter_haystack); if (!mbui_avail (rhaystack)) /* No match. */ return NULL; if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) /* Nothing in this round. */ break; } } } } else return (char *) haystack; } else #endif { if (*needle != '\0') { /* Speed up the following searches of needle by caching its first character. */ char b = *needle++; for (;; haystack++) { if (*haystack == '\0') /* No match. */ return NULL; if (*haystack == b) /* The first character matches. */ { const char *rhaystack = haystack + 1; const char *rneedle = needle; for (;; rhaystack++, rneedle++) { if (*rneedle == '\0') /* Found a match. */ return (char *) haystack; if (*rhaystack == '\0') /* No match. */ return NULL; if (*rhaystack != *rneedle) /* Nothing in this round. */ break; } } } } else return (char *) haystack; } }
/* Find the first occurrence in the character string STRING of any character not in the character string REJECT. Return the number of bytes from the beginning of the string to this occurrence, or to the end of the string if none exists. */ size_t mbsspn (const char *string, const char *reject) { /* Optimize two cases. */ if (reject[0] == '\0') return 0; if (reject[1] == '\0') { unsigned char uc = (unsigned char) reject[0]; if (MB_CUR_MAX > 1) { mbui_iterator_t iter; for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) if (!(mb_len (mbui_cur (iter)) == 1 && (unsigned char) * mbui_cur_ptr (iter) == uc)) break; return mbui_cur_ptr (iter) - string; } else { const char *ptr; for (ptr = string; *ptr != '\0'; ptr++) if ((unsigned char) *ptr != uc) break; return ptr - string; } } /* General case. */ if (MB_CUR_MAX > 1) { mbui_iterator_t iter; for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) { if (mb_len (mbui_cur (iter)) == 1) { if (mbschr (reject, * mbui_cur_ptr (iter)) == NULL) goto found; } else { mbui_iterator_t aiter; for (mbui_init (aiter, reject);; mbui_advance (aiter)) { if (!mbui_avail (aiter)) goto found; if (mb_equal (mbui_cur (aiter), mbui_cur (iter))) break; } } } found: return mbui_cur_ptr (iter) - string; } else return strspn (string, reject); }