/* This function is equivalent to strncasecmp() for multibyte strings:
 * compare at most n multibyte characters of s1 and s2, ignoring case.
 * Return zero when the compared parts match, and a nonzero value whose
 * sign gives the ordering when they differ.  Outside UTF-8 mode the
 * work is delegated to strncasecmp(). */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
        char *s1_mb, *s2_mb;
        int result = 0;
        bool differ = FALSE;

        if (s1 == s2)
            return 0;

        assert(s1 != NULL && s2 != NULL);

        s1_mb = charalloc(MB_CUR_MAX);
        s2_mb = charalloc(MB_CUR_MAX);

        while (*s1 != '\0' && *s2 != '\0' && n > 0) {
            bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
            int s1_mb_len, s2_mb_len;
            wchar_t ws1, ws2;

            /* Decode the next character of each string.  When decoding
             * fails, fall back to the raw byte and remember that this
             * character was invalid. */
            s1_mb_len = parse_mbchar(s1, s1_mb, NULL);

            if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
                mbtowc_reset();
                ws1 = (unsigned char)*s1_mb;
                bad_s1_mb = TRUE;
            }

            s2_mb_len = parse_mbchar(s2, s2_mb, NULL);

            if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
                mbtowc_reset();
                ws2 = (unsigned char)*s2_mb;
                bad_s2_mb = TRUE;
            }

            if (bad_s1_mb != bad_s2_mb || towlower(ws1) != towlower(ws2)) {
                result = towlower(ws1) - towlower(ws2);

                /* A valid character never matches an invalid byte, even
                 * when their numeric values happen to coincide. */
                if (result == 0)
                    result = bad_s1_mb ? 1 : -1;

                differ = TRUE;
                break;
            }

            s1 += move_mbright(s1, 0);
            s2 += move_mbright(s2, 0);
            n--;
        }

        free(s1_mb);
        free(s2_mb);

        /* When the loop ran out of string instead of out of count, at
         * least one string sits at its terminator: compare the current
         * bytes, so that a proper prefix sorts before the longer string
         * and two empty strings compare equal. */
        if (!differ && n > 0)
            result = (unsigned char)*s1 - (unsigned char)*s2;

        return result;
    } else
#endif
        return strncasecmp(s1, s2, n);
}
/* Return the number of columns that the multibyte character at c
 * occupies; this is the multibyte counterpart of wcwidth().  Outside
 * UTF-8 mode every character counts as one column. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wide;
        int columns;

        /* An undecodable sequence is measured as bad_wchar instead. */
        if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
            mbtowc_reset();
            wide = bad_wchar;
        }

        columns = wcwidth(wide);

        /* When wcwidth() reports -1 for the character, report the
         * width of bad_wchar in its place. */
        if (columns != -1)
            return columns;

        wide = bad_wchar;
        return wcwidth(wide);
    }
#endif

    return 1;
}
/* c is a multibyte non-control character.  Store its multibyte
 * representation in crep, store the number of bytes written in
 * *crep_len, and return crep.  When c is an invalid multibyte sequence
 * or decodes to an invalid Unicode codepoint, the replacement sequence
 * bad_mbchar (bad_mbchar_len bytes) is stored instead.  No terminating
 * NUL is appended explicitly; rely on *crep_len for the length. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wide;

        /* Accept only sequences that decode to a valid codepoint. */
        if (mbtowc(&wide, c, MB_CUR_MAX) >= 0 && is_valid_unicode(wide)) {
            int written = wctomb(crep, wide);

            /* If re-encoding fails, report an empty representation. */
            if (written < 0) {
                wctomb_reset();
                written = 0;
            }

            *crep_len = written;
            return crep;
        }

        /* Everything else is replaced by the bad-character sequence. */
        mbtowc_reset();
        *crep_len = bad_mbchar_len;
        strncpy(crep, bad_mbchar, *crep_len);
        return crep;
    }
#endif

    /* Single-byte mode: the character represents itself. */
    *crep_len = 1;
    *crep = *c;

    return crep;
}
/* c is a multibyte control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  Store the multibyte form of that displayed
 * character (as produced by control_wrep() or control_rep()) in crep,
 * store its byte count in *crep_len, and return crep.  When c is an
 * invalid multibyte sequence, the replacement sequence bad_mbchar
 * (bad_mbchar_len bytes) is stored instead. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wide;

        if (mbtowc(&wide, c, MB_CUR_MAX) >= 0) {
            int written = wctomb(crep, control_wrep(wide));

            /* If encoding the representation fails, report it as empty. */
            if (written < 0) {
                wctomb_reset();
                written = 0;
            }

            *crep_len = written;
            return crep;
        }

        /* Undecodable input: hand back the bad-character sequence. */
        mbtowc_reset();
        *crep_len = bad_mbchar_len;
        strncpy(crep, bad_mbchar, *crep_len);
        return crep;
    }
#endif

    /* Single-byte mode: one byte in, one representation byte out. */
    *crep_len = 1;
    *crep = control_rep(*c);

    return crep;
}
/* Assess how many bytes the given (multibyte) character occupies.
 * Return -1 if the byte sequence is invalid, and return the number of
 * bytes minus 8 when it encodes an invalid codepoint; in both of these
 * cases *width is left untouched.  Otherwise return the number of bytes
 * and store in *width the number of columns the character occupies.
 * Outside UTF-8 mode every character is one byte and one column. */
int length_of_char(const char *c, int *width)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wc;
        int charlen = mbtowc(&wc, c, MB_CUR_MAX);

        /* If the sequence is invalid... */
        if (charlen < 0) {
            mbtowc_reset();
            return -1;
        }

        /* If the codepoint is invalid... */
        if (!is_valid_unicode(wc))
            return charlen - 8;

        *width = wcwidth(wc);

        /* If the codepoint is unassigned, assume a width of one. */
        if (*width < 0)
            *width = 1;

        return charlen;
    }
#endif

    /* Report the width on this path too, so that the caller never reads
     * an unset value.  The NULL check keeps callers working that did
     * not supply a width pointer in single-byte mode. */
    if (width != NULL)
        *width = 1;

    return 1;
}
/* This function is equivalent to strchr() for multibyte strings: return
 * a pointer to the first occurrence in s of the multibyte character
 * that c points to, or NULL when there is none.  An invalid sequence
 * matches only an invalid sequence with the same leading byte.  Note
 * that in UTF-8 mode a search for the terminating NUL yields NULL,
 * whereas the single-byte path follows strchr(). */
char *mbstrchr(const char *s, const char *c)
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        bool bad_c_mb = FALSE;
        char *s_mb = charalloc(MB_CUR_MAX);
        wchar_t ws, wc;

        /* Decode the character to look for; when it is invalid, search
         * for its first byte among the invalid sequences instead. */
        if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
            mbtowc_reset();
            wc = (unsigned char)*c;
            bad_c_mb = TRUE;
        }

        while (*s != '\0') {
            /* The validity flag must start out clean for every
             * character; otherwise one invalid sequence would make all
             * later valid characters look invalid and hide matches. */
            bool bad_s_mb = FALSE;
            int s_mb_len = parse_mbchar(s, s_mb, NULL);

            if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
                mbtowc_reset();
                ws = (unsigned char)*s;
                bad_s_mb = TRUE;
            }

            if (bad_s_mb == bad_c_mb && ws == wc)
                break;

            s += s_mb_len;
        }

        free(s_mb);

        /* Reaching the terminator means that nothing matched. */
        return (*s == '\0') ? NULL : (char *)s;
    } else
#endif
        return strchr(s, *c);
}
/* Compare at most n multibyte characters of s1 and s2 without regard
 * to case, like strncasecmp() does for single-byte strings.  Return
 * zero when the compared parts match and a nonzero value otherwise.
 * Outside UTF-8 mode the work is delegated to strncasecmp(). */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t left, right;

        for (; *s1 != '\0' && *s2 != '\0' && n > 0;
                s1 += move_mbright(s1, 0), s2 += move_mbright(s2, 0), n--) {
            bool invalid1, invalid2;

            invalid1 = (mbtowc(&left, s1, MB_CUR_MAX) < 0);
            if (invalid1)
                mbtowc_reset();

            invalid2 = (mbtowc(&right, s2, MB_CUR_MAX) < 0);
            if (invalid2)
                mbtowc_reset();

            if (!invalid1 && !invalid2) {
                /* Two valid characters: compare their lowercase forms. */
                int delta = towlower(left) - towlower(right);

                if (delta != 0)
                    return delta;
            } else if (*s1 != *s2) {
                /* At least one invalid sequence: order by leading byte. */
                return (unsigned char)*s1 - (unsigned char)*s2;
            } else if (invalid1 != invalid2) {
                /* Same leading byte, but only one side is invalid; the
                 * invalid side is considered the greater one. */
                return invalid1 ? 1 : -1;
            }
        }

        /* The count was exhausted without finding a difference. */
        if (n == 0)
            return 0;

        /* Otherwise a string ended: compare the bytes now pointed at. */
        return (unsigned char)*s1 - (unsigned char)*s2;
    }
#endif

    return strncasecmp(s1, s2, n);
}
/* Report whether the multibyte character at c is alphanumeric; this is
 * the multibyte counterpart of isalnum().  An undecodable sequence is
 * classified as bad_wchar would be. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wide;
        int decoded = mbtowc(&wide, c, MB_CUR_MAX);

        if (decoded < 0) {
            mbtowc_reset();
            wide = bad_wchar;
        }

        return iswalnum(wide);
    }
#endif

    /* Single-byte mode: classify the byte itself. */
    return isalnum((unsigned char)*c);
}
/* Report whether the multibyte character at c is punctuation; this is
 * the multibyte counterpart of ispunct().  An undecodable sequence is
 * never punctuation. */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wide;
        int decoded = mbtowc(&wide, c, MB_CUR_MAX);

        if (decoded >= 0)
            return iswpunct(wide);

        mbtowc_reset();
        return 0;
    }
#endif

    /* Single-byte mode: classify the byte itself. */
    return ispunct((unsigned char)*c);
}