/** * g_utf8_to_ucs4: * @str: a UTF-8 encoded string * @len: the maximum length of @str to use, in bytes. If @len < 0, * then the string is nul-terminated. * @items_read: (out caller-allocates) (optional): location to store number of * bytes read, or %NULL. * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be * returned in case @str contains a trailing partial * character. If an error occurs then the index of the * invalid input is stored here. * @items_written: (out caller-allocates) (optional): location to store number * of characters written or %NULL. The value here stored does not include * the trailing 0 character. * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from UTF-8 to a 32-bit fixed width * representation as UCS-4. A trailing 0 character will be added to the * string after the converted text. * * Returns: a pointer to a newly allocated UCS-4 string. * This value must be freed with g_free(). If an error occurs, * %NULL will be returned and @error set. */ gunichar * g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error) { gunichar *result = NULL; gint n_chars, i; const gchar *in; in = str; n_chars = 0; while ((len < 0 || str + len - in > 0) && *in) { gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 
6 : str + len - in); if (wc & 0x80000000) { if (wc == (gunichar)-2) { if (items_read) break; else g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, _("Partial character sequence at end of input")); } else g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid byte sequence in conversion input")); goto err_out; } n_chars++; in = g_utf8_next_char (in); } result = try_malloc_n (n_chars + 1, sizeof (gunichar), error); if (result == NULL) goto err_out; in = str; for (i=0; i < n_chars; i++) { result[i] = g_utf8_get_char (in); in = g_utf8_next_char (in); } result[i] = 0; if (items_written) *items_written = n_chars; err_out: if (items_read) *items_read = in - str; return result; }
/**
 * g_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use, in bytes. If @len < 0, then
 *       the string is nul-terminated.
 * @items_read: location to store number of bytes read, or %NULL.
 *              If %NULL, then a trailing partial character in @str
 *              is reported as an error; otherwise conversion stops
 *              just before it. If an error occurs then the index of
 *              the invalid input is stored here.
 * @items_written: location to store number of characters written or %NULL.
 *                 The value stored here does not include the trailing 0
 *                 character. Nothing is stored when an error occurs.
 * @error: location to store a message describing the error, or %NULL
 *         to ignore errors. The message is a wide string literal: it
 *         must not be modified or freed.
 *
 * Convert a string from UTF-8 to a fixed width representation held
 * in #wchar_t elements. A trailing 0 will be added to the string
 * after the converted text.
 *
 * Return value: a pointer to a newly allocated string, or %NULL if
 *               an error occurs (invalid input, or failure to
 *               allocate the result), in which case @error is set.
 *               The buffer is obtained from malloc(), so release it
 *               with free().
 **/
wchar_t *
g_utf8_to_ucs4 (const char *str,
                long        len,
                long       *items_read,
                long       *items_written,
                wchar_t   **error)
{
  wchar_t *result = NULL;
  size_t n_chars = 0;
  size_t i;
  const char *in = str;

  /* First pass: validate the input and count its characters. */
  while ((len < 0 || str + len - in > 0) && *in)
    {
      /* Only compute str + len when len is non-negative: with a
       * negative len it would point before the start of str. For
       * nul-terminated input pass 6 instead, the same bound the
       * other converters in this file use. */
      wchar_t wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);

      /* A set top bit marks one of the decoder's error values. */
      if (wc & 0x80000000)
        {
          if (wc == (wchar_t) -2)
            {
              /* Trailing partial character: not an error when the
               * caller can find out where the conversion stopped. */
              if (items_read)
                break;

              if (error)
                *error = L"Partial character sequence at end of input";
            }
          else if (error)
            {
              *error = L"Invalid byte sequence in conversion input";
            }

          goto err_out;
        }

      n_chars++;
      in = g_utf8_next_char (in);
    }

  /* Allocate room for the characters plus the terminator, refusing
   * sizes whose byte count would wrap around. */
  if (n_chars < (size_t) -1 / sizeof *result)
    result = malloc ((n_chars + 1) * sizeof *result);

  if (result == NULL)
    {
      if (error)
        *error = L"Failed to allocate memory";

      goto err_out;
    }

  /* Second pass: decode the characters counted above. */
  in = str;
  for (i = 0; i < n_chars; i++)
    {
      result[i] = g_utf8_get_char (in);
      in = g_utf8_next_char (in);
    }
  result[i] = 0;

  if (items_written)
    *items_written = (long) n_chars;

 err_out:
  if (items_read)
    *items_read = in - str;

  return result;
}
/**
 * g_utf8_get_char_validated:
 * @p: a pointer to Unicode character encoded as UTF-8
 * @max_len: the maximum number of bytes to read, or -1, for no maximum.
 *
 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
 * This function checks for incomplete characters, for invalid characters
 * such as characters that are out of the range of Unicode, and for
 * overlong encodings of valid characters.
 *
 * If @max_len is 0, no byte at @p is read and the input is treated
 * as an incomplete sequence.
 *
 * Return value: the resulting character. If @p points to a partial
 *     sequence at the end of a string that could begin a valid
 *     character (or @max_len is 0), returns (gunichar)-2; otherwise,
 *     if @p does not point to a valid UTF-8 encoded Unicode character,
 *     returns (gunichar)-1.
 **/
gunichar
g_utf8_get_char_validated (const gchar *p,
                           gssize       max_len)
{
  gunichar result;

  /* With a budget of zero bytes not even the lead byte may be read,
   * so report a partial sequence without calling the decoder. */
  if (max_len == 0)
    return (gunichar) -2;

  result = g_utf8_get_char_extended (p, max_len);

  /* A set top bit means the decoder already returned an error value
   * ((gunichar)-1 or (gunichar)-2); pass it through unchanged. */
  if (result & 0x80000000)
    return result;
  else if (!UNICODE_VALID (result))
    return (gunichar) -1;
  else
    return result;
}
/** * g_utf8_to_utf16: * @str: a UTF-8 encoded string * @len: the maximum length (number of bytes) of @str to use. * If @len < 0, then the string is nul-terminated. * @items_read: (out caller-allocates) (optional): location to store number of * bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will * be returned in case @str contains a trailing partial character. If * an error occurs then the index of the invalid input is stored here. * @items_written: (out caller-allocates) (optional): location to store number * of #gunichar2 written, or %NULL. The value stored here does not include * the trailing 0. * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from UTF-8 to UTF-16. A 0 character will be * added to the result after the converted text. * * Returns: a pointer to a newly allocated UTF-16 string. * This value must be freed with g_free(). If an error occurs, * %NULL will be returned and @error set. */ gunichar2 * g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error) { gunichar2 *result = NULL; gint n16; const gchar *in; gint i; g_return_val_if_fail (str != NULL, NULL); in = str; n16 = 0; while ((len < 0 || str + len - in > 0) && *in) { gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 
6 : str + len - in); if (wc & 0x80000000) { if (wc == (gunichar)-2) { if (items_read) break; else g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, _("Partial character sequence at end of input")); } else g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid byte sequence in conversion input")); goto err_out; } if (wc < 0xd800) n16 += 1; else if (wc < 0xe000) { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid sequence in conversion input")); goto err_out; } else if (wc < 0x10000) n16 += 1; else if (wc < 0x110000) n16 += 2; else { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Character out of range for UTF-16")); goto err_out; } in = g_utf8_next_char (in); } result = try_malloc_n (n16 + 1, sizeof (gunichar2), error); if (result == NULL) goto err_out; in = str; for (i = 0; i < n16;) { gunichar wc = g_utf8_get_char (in); if (wc < 0x10000) { result[i++] = wc; } else { result[i++] = (wc - 0x10000) / 0x400 + 0xd800; result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; } in = g_utf8_next_char (in); } result[i] = 0; if (items_written) *items_written = n16; err_out: if (items_read) *items_read = in - str; return result; }