/*
 * fast_validate:
 * @str: a nul-terminated byte string to scan as UTF-8
 *
 * Walks @str one encoded character at a time. Bytes below 128 are
 * accepted as-is; two-, three- and four-byte sequences are checked for
 * well-formed continuation bytes, for overlong encodings and against
 * UNICODE_VALID().
 *
 * Returns: a pointer to the terminating nul byte if every sequence was
 * accepted, otherwise a pointer to the lead byte of the first rejected
 * sequence.
 *
 * NOTE(review): CONTINUATION_CHAR is defined outside this block; it
 * presumably checks that *p is a 10xxxxxx byte, folds its low six bits
 * into `val`, and jumps to `error` on failure -- confirm against the
 * macro definition.
 */
static const char *
fast_validate (const char *str)
{
  unsigned int val = 0;
  unsigned int min = 0;
  const char *last = str;   /* always reassigned before `error` is reached */
  const char *p;

  for (p = str; *p; p++)
    {
      const unsigned char lead = *(const unsigned char *) p;

      if (lead < 128)
        continue;                       /* single-byte character */

      last = p;                         /* remember where this sequence began */

      if ((lead & 0xe0) == 0xc0)        /* 110xxxxx: two-byte sequence */
        {
          /* Upper four payload bits all zero: value would fit in 7 bits. */
          if ((lead & 0x1e) == 0)
            goto error;

          p++;
          if ((*(const unsigned char *) p & 0xc0) != 0x80)   /* 10xxxxxx */
            goto error;

          continue;
        }

      if ((lead & 0xf0) == 0xe0)        /* 1110xxxx: two continuations follow */
        {
          min = (1 << 11);
          val = lead & 0x0f;
        }
      else if ((lead & 0xf8) == 0xf0)   /* 11110xxx: three continuations follow */
        {
          min = (1 << 16);
          val = lead & 0x07;

          /* The extra continuation byte that only the four-byte form has. */
          p++;
          CONTINUATION_CHAR;
        }
      else
        {
          goto error;
        }

      /* The last two continuation bytes, shared by both forms. */
      p++;
      CONTINUATION_CHAR;
      p++;
      CONTINUATION_CHAR;

      if (val < min)                    /* overlong encoding */
        goto error;
      if (!UNICODE_VALID (val))
        goto error;
    }

  return p;

error:
  return last;
}
static int utf8_validate (const char *str) { gunichar val = 0; gunichar min = 0; const char *p; for (p = str; *p; p++) { if (*(guchar *)p < 128) /* done */; else { const char *last; last = p; if ((*(guchar *)p & 0xe0) == 0xc0) { /* 110xxxxx */ if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) goto error; p++; if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ goto error; } else { if ((*(guchar *)p & 0xf0) == 0xe0) { /* 1110xxxx */ min = (1 << 11); val = *(guchar *)p & 0x0f; goto TWO_REMAINING; } else if ((*(guchar *)p & 0xf8) == 0xf0) { /* 11110xxx */ min = (1 << 16); val = *(guchar *)p & 0x07; } else goto error; p++; CONTINUATION_CHAR; TWO_REMAINING: p++; CONTINUATION_CHAR; p++; CONTINUATION_CHAR; if (G_UNLIKELY (val < min)) goto error; if (G_UNLIKELY (!UNICODE_VALID(val))) goto error; } continue; error: return 0; } } return *p == 0; }
/** * _cairo_utf8_to_utf16: * @str: an UTF-8 string * @len: length of @str in bytes, or -1 if it is nul-terminated. * If @len is supplied and the string has an embedded nul * byte, only the portion before the nul byte is converted. * @result: location to store a pointer to a newly allocated UTF-16 * string (always native endian). Free with free(). A 0 * word will be written after the last character. * @items_written: location to store number of 16-bit words * written. (Not including the trailing 0) * * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode * where characters are represented either as a single 16-bit word, or * as a pair of 16-bit "surrogates". The string is validated to * consist entirely of valid Unicode characters. * * Return value: %CAIRO_STATUS_SUCCESS if the entire string was * succesfully converted. %CAIRO_STATUS_INVALID_STRING if an * an invalid sequence was found. **/ cairo_status_t _cairo_utf8_to_utf16 (const unsigned char *str, int len, uint16_t **result, int *items_written) { uint16_t *str16 = NULL; int n16, i; const unsigned char *in; in = str; n16 = 0; while ((len < 0 || str + len - in > 0) && *in) { uint32_t wc = _utf8_get_char_extended (in, str + len - in); if (wc & 0x80000000 || !UNICODE_VALID (wc)) return CAIRO_STATUS_INVALID_STRING; if (wc < 0x10000) n16 += 1; else n16 += 2; if (n16 == INT_MAX - 1 || n16 == INT_MAX) return CAIRO_STATUS_INVALID_STRING; in = UTF8_NEXT_CHAR (in); } str16 = malloc (sizeof (uint16_t) * (n16 + 1)); if (!str16) return CAIRO_STATUS_NO_MEMORY; in = str; for (i = 0; i < n16;) { uint32_t wc = _utf8_get_char (in); if (wc < 0x10000) { str16[i++] = wc; } else { str16[i++] = (wc - 0x10000) / 0x400 + 0xd800; str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00; } in = UTF8_NEXT_CHAR (in); } str16[i] = 0; *result = str16; if (items_written) *items_written = n16; return CAIRO_STATUS_SUCCESS; }
/**
 * g_utf8_get_char_validated:
 * @p: a pointer to Unicode character encoded as UTF-8
 * @max_len: the maximum number of bytes to read, or -1, for no maximum.
 *
 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
 * This function checks for incomplete characters, for invalid characters
 * such as characters that are out of the range of Unicode, and for
 * overlong encodings of valid characters.
 *
 * If @max_len is 0 no byte may be read at all, so the input is reported
 * as a partial sequence without calling the decoder on @p.
 *
 * Return value: the resulting character. If @p points to a partial
 *    sequence at the end of a string that could begin a valid
 *    character (or @max_len is 0), returns (gunichar)-2; otherwise,
 *    if @p does not point to a valid UTF-8 encoded Unicode character,
 *    returns (gunichar)-1.
 **/
gunichar
g_utf8_get_char_validated (const gchar *p,
			   gssize       max_len)
{
  gunichar result;

  /* Zero bytes available: nothing may be read, so report "incomplete"
   * instead of handing @p to the decoder. */
  if (max_len == 0)
    return (gunichar)-2;

  result = g_utf8_get_char_extended (p, max_len);

  /* A result with the top bit set is the decoder's own error code
   * and is passed through unchanged. */
  if (result & 0x80000000)
    return result;
  else if (!UNICODE_VALID (result))
    return (gunichar)-1;
  else
    return result;
}
/** * _cairo_utf8_to_ucs4: * @str: an UTF-8 string * @len: length of @str in bytes, or -1 if it is nul-terminated. * If @len is supplied and the string has an embedded nul * byte, only the portion before the nul byte is converted. * @result: location to store a pointer to a newly allocated UTF-32 * string (always native endian), or %NULL. Free with free(). A 0 * word will be written after the last character. * @items_written: location to store number of 32-bit words * written. (Not including the trailing 0) * * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode * with 1 32-bit word per character. The string is validated to * consist entirely of valid Unicode characters. * * Return value: %CAIRO_STATUS_SUCCESS if the entire string was * successfully converted. %CAIRO_STATUS_INVALID_STRING if an * invalid sequence was found. **/ cairo_status_t _cairo_utf8_to_ucs4 (const char *str, int len, uint32_t **result, int *items_written) { uint32_t *str32 = NULL; int n_chars, i; const unsigned char *in; const unsigned char * const ustr = (const unsigned char *) str; in = ustr; n_chars = 0; while ((len < 0 || ustr + len - in > 0) && *in) { uint32_t wc = _utf8_get_char_extended (in, ustr + len - in); if (wc & 0x80000000 || !UNICODE_VALID (wc)) return _cairo_error (CAIRO_STATUS_INVALID_STRING); n_chars++; if (n_chars == INT_MAX) return _cairo_error (CAIRO_STATUS_INVALID_STRING); in = UTF8_NEXT_CHAR (in); } if (result) { str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t)); if (!str32) return _cairo_error (CAIRO_STATUS_NO_MEMORY); in = ustr; for (i=0; i < n_chars; i++) { str32[i] = _utf8_get_char (in); in = UTF8_NEXT_CHAR (in); } str32[i] = 0; *result = str32; } if (items_written) *items_written = n_chars; return CAIRO_STATUS_SUCCESS; }
/*
 * _utf8_get_char_validated:
 * @p: pointer to the first byte of a UTF-8 encoded character
 * @max_len: maximum number of bytes that may be read from @p
 *
 * Decodes one character via _utf8_get_char_extended() and additionally
 * checks the result with UNICODE_VALID().
 *
 * Returns: the decoded character on success; -2 if @max_len is 0; the
 * decoder's own result unchanged if it has the top bit set; -1 if the
 * decoded value fails UNICODE_VALID().
 */
int
_utf8_get_char_validated (const char *p, int max_len)
{
    int decoded;

    /* Nothing may be read: report this before touching @p. */
    if (max_len == 0)
        return -2;

    decoded = _utf8_get_char_extended (p, max_len);

    /* A result with the top bit set is passed through unchanged. */
    if (decoded & 0x80000000)
        return decoded;

    return UNICODE_VALID (decoded) ? decoded : -1;
}
/**
 * g_unichar_validate:
 * @ch: a Unicode character
 *
 * Tests @ch with UNICODE_VALID() to decide whether it is a valid
 * Unicode character. Not every integer value of @ch is valid. 0 is
 * treated as a valid character even though it normally acts as a
 * string terminator.
 *
 * Returns: %TRUE if @ch is a valid Unicode character
 **/
gboolean
g_unichar_validate (gunichar ch)
{
  const gboolean is_valid = UNICODE_VALID (ch);

  return is_valid;
}
/*
 * avahi_utf8_valid:
 * @str: a nul-terminated byte string to scan as UTF-8
 *
 * Checks every encoded character in @str: ASCII bytes pass through,
 * multi-byte sequences must have well-formed continuation bytes, must
 * not be overlong, and must satisfy UNICODE_VALID().
 *
 * Returns: @str itself if the whole string was accepted, NULL as soon
 * as a sequence is rejected.
 *
 * NOTE(review): CONTINUATION_CHAR is defined outside this block; it
 * presumably validates *p as a 10xxxxxx byte, accumulates it into `val`
 * and jumps to `error` on failure -- confirm against the macro.
 */
const char *
avahi_utf8_valid (const char *str)
{
    unsigned val = 0;
    unsigned min = 0;
    const char *p;

    for (p = str; *p; p++) {
        const unsigned char lead = *(const unsigned char *) p;

        if (lead < 128)
            continue;                       /* single-byte character */

        if ((lead & 0xe0) == 0xc0) {        /* 110xxxxx: two-byte sequence */
            /* Upper four payload bits all zero: value would fit in 7 bits. */
            if ((lead & 0x1e) == 0)
                goto error;

            p++;
            if ((*(const unsigned char *) p & 0xc0) != 0x80)   /* 10xxxxxx */
                goto error;

            continue;
        }

        if ((lead & 0xf0) == 0xe0) {        /* 1110xxxx: two continuations follow */
            min = (1 << 11);
            val = lead & 0x0f;
        } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx: three continuations follow */
            min = (1 << 16);
            val = lead & 0x07;

            /* The extra continuation byte that only the four-byte form has. */
            p++;
            CONTINUATION_CHAR;
        } else {
            goto error;
        }

        /* The last two continuation bytes, shared by both forms. */
        p++;
        CONTINUATION_CHAR;
        p++;
        CONTINUATION_CHAR;

        if (val < min)                      /* overlong encoding */
            goto error;
        if (!UNICODE_VALID (val))
            goto error;
    }

    return str;

error:
    return NULL;
}
/*
 * fast_validate_len:
 * @str: a byte string to scan as UTF-8
 * @max_len: number of bytes of @str that may be examined; must be >= 0
 *
 * Length-bounded validator: scanning stops at @max_len bytes or at a
 * nul byte, whichever comes first. Before reading the continuation
 * bytes of a multi-byte sequence the function checks that the whole
 * sequence fits inside @max_len.
 *
 * Returns: a pointer to where scanning stopped if every sequence was
 * accepted, otherwise a pointer to the lead byte of the first rejected
 * (or truncated) sequence.
 *
 * NOTE(review): CONTINUATION_CHAR is defined outside this block; it
 * presumably validates *p as a 10xxxxxx byte, accumulates it into `val`
 * and jumps to `error` on failure -- confirm against the macro.
 */
static const gchar *
fast_validate_len (const char *str,
		   gssize      max_len)
{
  gunichar val = 0;
  gunichar min = 0;
  const gchar *last = str;   /* always reassigned before `error` is reached */
  const gchar *p;

  g_assert (max_len >= 0);

  for (p = str; ((p - str) < max_len) && *p; p++)
    {
      const guchar lead = *(const guchar *) p;
      gssize remaining;

      if (lead < 128)
	continue;                       /* single-byte character */

      last = p;                         /* remember where this sequence began */
      remaining = max_len - (p - str);  /* bytes left, lead byte included */

      if ((lead & 0xe0) == 0xc0)        /* 110xxxxx: two-byte sequence */
	{
	  if (G_UNLIKELY (remaining < 2))
	    goto error;
	  /* Upper four payload bits all zero: value would fit in 7 bits. */
	  if (G_UNLIKELY ((lead & 0x1e) == 0))
	    goto error;

	  p++;
	  if (G_UNLIKELY ((*(const guchar *) p & 0xc0) != 0x80))   /* 10xxxxxx */
	    goto error;

	  continue;
	}

      if ((lead & 0xf0) == 0xe0)        /* 1110xxxx: two continuations follow */
	{
	  if (G_UNLIKELY (remaining < 3))
	    goto error;

	  min = (1 << 11);
	  val = lead & 0x0f;
	}
      else if ((lead & 0xf8) == 0xf0)   /* 11110xxx: three continuations follow */
	{
	  if (G_UNLIKELY (remaining < 4))
	    goto error;

	  min = (1 << 16);
	  val = lead & 0x07;

	  /* The extra continuation byte that only the four-byte form has. */
	  p++;
	  CONTINUATION_CHAR;
	}
      else
	{
	  goto error;
	}

      /* The last two continuation bytes, shared by both forms. */
      p++;
      CONTINUATION_CHAR;
      p++;
      CONTINUATION_CHAR;

      if (G_UNLIKELY (val < min))       /* overlong encoding */
	goto error;
      if (G_UNLIKELY (!UNICODE_VALID (val)))
	goto error;
    }

  return p;

error:
  return last;
}