/*
 * _utf8_get_char_extended:
 * @s: pointer to the first byte of a UTF-8 sequence
 * @max_len: number of bytes available at @s, or a negative value when no
 *           explicit bound applies
 *
 * Decodes a single character starting at @s.  Lead bytes announcing
 * sequences of two to six bytes are accepted.  The first byte is always
 * read, whatever the value of @max_len.
 *
 * Returns: the decoded value on success; (unsigned int)-1 converted to int
 * when the bytes do not form a valid sequence; (unsigned int)-2 converted
 * to int when the sequence is cut short, either by @max_len or by a zero
 * byte where a continuation byte was expected.
 */
int _utf8_get_char_extended(const char *s, int max_len)
{
    const unsigned char *bytes = (const unsigned char *)s;
    const unsigned int malformed = (unsigned int)-1;
    const unsigned int truncated = (unsigned int)-2;
    unsigned int code = bytes[0];
    int seq_len;
    int k;

    /* Single-byte (ASCII) character: nothing more to decode. */
    if (code < 0x80)
        return code;

    /* A continuation byte cannot start a sequence. */
    if (code < 0xc0)
        return malformed;

    /* The lead byte fixes the sequence length and its payload bits. */
    if (code < 0xe0) {
        seq_len = 2;
        code &= 0x1f;
    } else if (code < 0xf0) {
        seq_len = 3;
        code &= 0x0f;
    } else if (code < 0xf8) {
        seq_len = 4;
        code &= 0x07;
    } else if (code < 0xfc) {
        seq_len = 5;
        code &= 0x03;
    } else if (code < 0xfe) {
        seq_len = 6;
        code &= 0x01;
    } else {
        return malformed;
    }

    /*
     * Not enough bytes for the whole sequence: check the ones that are
     * available so that garbage is reported as malformed rather than as
     * merely incomplete.
     */
    if (max_len >= 0 && seq_len > max_len) {
        for (k = 1; k < max_len; k++) {
            if ((bytes[k] & 0xc0) != 0x80)
                return malformed;
        }
        return truncated;
    }

    /* Fold in six payload bits from every continuation byte. */
    for (k = 1; k < seq_len; ++k) {
        unsigned int next = bytes[k];

        if ((next & 0xc0) != 0x80) {
            /* A zero byte means the string ended inside the sequence. */
            return next ? malformed : truncated;
        }

        code = (code << 6) | (next & 0x3f);
    }

    /*
     * UTF8_LENGTH is defined elsewhere; the result is rejected when the
     * length it reports for the decoded value differs from the length
     * announced by the lead byte (presumably an overlong encoding).
     */
    if (UTF8_LENGTH(code) != seq_len)
        return malformed;

    return code;
}
static void test_valid_turkish() { long nwritten; long nread; char *res; int i; long size; gunichar *verify; unsigned char *back; unsigned char buf[2]; static int map_size = sizeof(gsm_turkish_to_unicode_map) / sizeof(unsigned short) / 2; for (i = 0; i < map_size; i++) { unsigned short c = gsm_turkish_to_unicode_map[i*2]; if (c & 0x1b00) { buf[0] = 0x1b; buf[1] = c & 0x7f; size = 2; } else { size = 1; buf[0] = c & 0x7f; } res = convert_gsm_to_utf8_with_lang(buf, size, &nread, &nwritten, 0, 1, 1); g_assert(res); if (g_test_verbose()) g_print("size: %ld, nread:%ld, nwritten:%ld, %s\n", size, nread, nwritten, res); g_assert(nread == size); verify = g_utf8_to_ucs4(res, -1, NULL, NULL, NULL); g_assert(verify[0] == gsm_turkish_to_unicode_map[i*2+1]); g_assert(verify[1] == 0); g_assert(nwritten == UTF8_LENGTH(verify[0])); back = convert_utf8_to_gsm_with_lang(res, -1, &nread, &nwritten, 0, 1, 1); g_assert(back); g_assert(nwritten == size); if (c & 0x1b00) { g_assert(back[0] == 0x1b); g_assert(back[1] == (c & 0x7f)); } else { g_assert(back[0] == (c & 0x7f)); } g_free(back); g_free(verify); g_free(res); } }
/* * g_ucs4_to_utf8: * @str: a UCS-4 encoded string * @len: the maximum length of @str to use. If @len < 0, then * the string is terminated with a 0 character. * @items_read: location to store number of characters read read, or %NULL. * @items_written: location to store number of bytes written or %NULL. * The value here stored does not include the trailing 0 * byte. * @error: location to store the error occuring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from a 32-bit fixed width representation as UCS-4. * to UTF-8. The result will be terminated with a 0 byte. * * Return value: a pointer to a newly allocated UTF-8 string. * This value must be freed with g_free(). If an * error occurs, %NULL will be returned and * @error set. **/ gchar * g_ucs4_to_utf8 (const gunichar * str, glong len, glong * items_read, glong * items_written, GError ** error) { gint result_length; gchar *result = NULL; gchar *p; gint i; result_length = 0; for (i = 0; len < 0 || i < len; i++) { if (!str[i]) break; if (str[i] >= 0x80000000) { if (items_read) *items_read = i; g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Character out of range for UTF-8")); goto err_out; } result_length += UTF8_LENGTH (str[i]); } result = g_malloc (result_length + 1); if (!result) return NULL; p = result; i = 0; while (p < result + result_length) p += g_unichar_to_utf8 (str[i++], p); *p = '\0'; if (items_written) *items_written = p - result; err_out: if (items_read) *items_read = i; return result; }
/**
 * g_ucs4_to_utf8:
 * @str: a UCS-4 encoded string, stored as an array of wchar_t
 * @len: the maximum number of characters of @str to use.  If @len < 0,
 *       conversion stops at the first 0 character.
 * @items_read: location to store the number of characters read, or %NULL.
 *       When an out-of-range character is found this is the index of
 *       that character; when the output buffer cannot be allocated it is
 *       the number of characters that were scanned.
 * @items_written: location to store the number of bytes written, or %NULL.
 *       The value does not include the trailing 0 byte and is only
 *       stored on success.
 * @error: location to store a pointer to a static wide-string message
 *       describing the failure, or %NULL to ignore errors.  The message
 *       must not be freed.
 *
 * Converts a string from the 32-bit fixed-width UCS-4 representation to
 * UTF-8.  The result is terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.  The buffer
 *               is obtained from malloc() and must be released with the
 *               matching deallocator.  If a character is out of range or
 *               the allocation fails, %NULL is returned and @error is set.
 **/
char *
g_ucs4_to_utf8 (const wchar_t *str, long len, long *items_read,
                long *items_written, wchar_t **error)
{
  int result_length = 0;
  char *result = NULL;
  char *p;
  int i;

  /* Pass 1: validate the input and add up the size of the output. */
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
        break;

      /* The cast makes negative wchar_t values fail the range check too. */
      if ((unsigned)str[i] >= 0x80000000)
        {
          if (error)
            *error = L"Character out of range for UTF-8";
          goto err_out;
        }

      result_length += UTF8_LENGTH (str[i]);
    }

  result = (char*)malloc (result_length + 1);
  if (!result)
    {
      /* Report the failure instead of writing through a NULL pointer. */
      if (error)
        *error = L"Failed to allocate memory";
      goto err_out;
    }

  /* Pass 2: encode until the precomputed number of bytes is produced. */
  p = result;
  i = 0;
  while (p < result + result_length)
    p += g_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
/*
 * g_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length (number of characters) of @str to use.
 *       If @len < 0, conversion stops at the first 0 character.
 * @items_read: location to store the number of characters read, or %NULL.
 *       When an out-of-range character is found this is the index of
 *       that character.
 * @items_written: location to store the number of bytes written, or %NULL.
 *       The value does not include the trailing 0 byte and is only
 *       stored on success.
 *
 * Converts a string from the 32-bit fixed-width UCS-4 representation to
 * UTF-8.  The result is terminated with a 0 byte.  This variant has no
 * error out-parameter; failure is signalled by the return value alone.
 *
 * Return value: a pointer to a newly allocated UTF-8 string, obtained
 *               from g_malloc().  %NULL is returned when a character is
 *               0x80000000 or larger.
 */
static gchar *
g_ucs4_to_utf8 (const gunichar * str, glong len, glong * items_read,
                glong * items_written)
{
  gchar *buffer = NULL;
  gchar *limit;
  gchar *out;
  gint total = 0;
  gint pos;

  /* First sweep: reject out-of-range input and size the output. */
  for (pos = 0; len < 0 || pos < len; pos++)
    {
      gunichar ch = str[pos];

      if (ch == 0)
        break;

      if (ch >= 0x80000000)
        goto finish;            /* pos is the index of the bad character */

      total += UTF8_LENGTH (ch);
    }

  buffer = g_malloc (total + 1);
  if (!buffer)
    return NULL;

  /* Second sweep: encode until the precomputed size has been written. */
  limit = buffer + total;
  pos = 0;
  for (out = buffer; out < limit; pos++)
    out += g_unichar_to_utf8 (str[pos], out);

  *out = '\0';

  if (items_written)
    *items_written = out - buffer;

finish:
  if (items_read)
    *items_read = pos;

  return buffer;
}
/** * cc_utf16_to_utf8: * @str: a UTF-16 encoded string * @len: the maximum length of @str to use. If @len < 0, then * the string is terminated with a 0 character. * @items_read: location to store number of words read, or %nullptr. * If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be * returned in case @str contains a trailing partial * character. If an error occurs then the index of the * invalid input is stored here. * @items_written: location to store number of bytes written, or %nullptr. * The value stored here does not include the trailing * 0 byte. * @error: location to store the error occuring, or %nullptr to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from UTF-16 to UTF-8. The result will be * terminated with a 0 byte. * * Return value: a pointer to a newly allocated UTF-8 string. * This value must be freed with free(). If an * error occurs, %nullptr will be returned and * @error set. **/ char * cc_utf16_to_utf8 (const unsigned short *str, int len, long *items_read, long *items_written) { /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ * are marked. 
*/ const unsigned short *in; char *out; char *result = nullptr; int n_bytes; unsigned int high_surrogate; if (str == 0) return nullptr; n_bytes = 0; in = str; high_surrogate = 0; while ((len < 0 || in - str < len) && *in) { unsigned short c = *in; unsigned int wc; if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ { if (high_surrogate) { wc = SURROGATE_VALUE (high_surrogate, c); high_surrogate = 0; } else { CCLOGERROR("Invalid sequence in conversion input"); goto err_out; } } else { if (high_surrogate) { CCLOGERROR("Invalid sequence in conversion input"); goto err_out; } if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ { high_surrogate = c; goto next1; } else wc = c; } /********** DIFFERENT for UTF8/UCS4 **********/ n_bytes += UTF8_LENGTH (wc); next1: in++; } if (high_surrogate && !items_read) { CCLOGERROR("Partial character sequence at end of input"); goto err_out; } /* At this point, everything is valid, and we just need to convert */ /********** DIFFERENT for UTF8/UCS4 **********/ result = new char[n_bytes + 1]; high_surrogate = 0; out = result; in = str; while (out < result + n_bytes) { unsigned short c = *in; unsigned int wc; if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ { wc = SURROGATE_VALUE (high_surrogate, c); high_surrogate = 0; } else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ { high_surrogate = c; goto next2; } else wc = c; /********** DIFFERENT for UTF8/UCS4 **********/ out += cc_unichar_to_utf8 (wc, out); next2: in++; } /********** DIFFERENT for UTF8/UCS4 **********/ *out = '\0'; if (items_written) /********** DIFFERENT for UTF8/UCS4 **********/ *items_written = out - result; err_out: if (items_read) *items_read = in - str; return result; }
/** * g_utf16_to_utf8: * @str: a UTF-16 encoded string * @len: the maximum length (number of #gunichar2) of @str to use. * If @len < 0, then the string is nul-terminated. * @items_read: (out caller-allocates) (optional): location to store number of * words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will * be returned in case @str contains a trailing partial character. If * an error occurs then the index of the invalid input is stored here. * @items_written: (out caller-allocates) (optional): location to store number * of bytes written, or %NULL. The value stored here does not include the * trailing 0 byte. * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError other than * %G_CONVERT_ERROR_NO_CONVERSION may occur. * * Convert a string from UTF-16 to UTF-8. The result will be * terminated with a 0 byte. * * Note that the input is expected to be already in native endianness, * an initial byte-order-mark character is not handled specially. * g_convert() can be used to convert a byte buffer of UTF-16 data of * ambiguous endianess. * * Further note that this function does not validate the result * string; it may e.g. include embedded NUL characters. The only * validation done by this function is to ensure that the input can * be correctly interpreted as UTF-16, i.e. it doesn't contain * things unpaired surrogates. * * Returns: a pointer to a newly allocated UTF-8 string. * This value must be freed with g_free(). If an error occurs, * %NULL will be returned and @error set. **/ gchar * g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error) { /* This function and g_utf16_to_ucs4 are almost exactly identical - * The lines that differ are marked. 
*/ const gunichar2 *in; gchar *out; gchar *result = NULL; gint n_bytes; gunichar high_surrogate; g_return_val_if_fail (str != NULL, NULL); n_bytes = 0; in = str; high_surrogate = 0; while ((len < 0 || in - str < len) && *in) { gunichar2 c = *in; gunichar wc; if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ { if (high_surrogate) { wc = SURROGATE_VALUE (high_surrogate, c); high_surrogate = 0; } else { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid sequence in conversion input")); goto err_out; } } else { if (high_surrogate) { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid sequence in conversion input")); goto err_out; } if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ { high_surrogate = c; goto next1; } else wc = c; } /********** DIFFERENT for UTF8/UCS4 **********/ n_bytes += UTF8_LENGTH (wc); next1: in++; } if (high_surrogate && !items_read) { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, _("Partial character sequence at end of input")); goto err_out; } /* At this point, everything is valid, and we just need to convert */ /********** DIFFERENT for UTF8/UCS4 **********/ result = try_malloc_n (n_bytes + 1, 1, error); if (result == NULL) goto err_out; high_surrogate = 0; out = result; in = str; while (out < result + n_bytes) { gunichar2 c = *in; gunichar wc; if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ { wc = SURROGATE_VALUE (high_surrogate, c); high_surrogate = 0; } else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ { high_surrogate = c; goto next2; } else wc = c; /********** DIFFERENT for UTF8/UCS4 **********/ out += g_unichar_to_utf8 (wc, out); next2: in++; } /********** DIFFERENT for UTF8/UCS4 **********/ *out = '\0'; if (items_written) /********** DIFFERENT for UTF8/UCS4 **********/ *items_written = out - result; err_out: if (items_read) *items_read = in - str; return result; }
/*
 * Variant of g_utf8_get_char() that honours a byte limit.
 *
 * @p:       pointer to the first byte of a UTF-8 sequence
 * @max_len: number of bytes available at @p, or a negative value when no
 *           explicit bound applies
 *
 * Returns the decoded character, (gunichar)-1 when the bytes do not form a
 * valid sequence, or (gunichar)-2 when the sequence is an incomplete
 * trailing character (cut short by @max_len or by a zero byte).  The first
 * byte is always read, whatever the value of @max_len.
 */
static inline gunichar
g_utf8_get_char_extended (const gchar *p, gssize max_len)
{
  const guchar *bytes = (const guchar *) p;
  const gunichar malformed = (gunichar) -1;
  const gunichar incomplete = (gunichar) -2;
  gunichar wc = bytes[0];
  guint len;
  guint i;

  /* Single-byte (ASCII) character. */
  if (wc < 0x80)
    return wc;

  /* Continuation bytes and 0xfe/0xff can never start a sequence. */
  if (wc < 0xc0 || wc >= 0xfe)
    return malformed;

  /* The lead byte fixes the sequence length and its payload bits. */
  if (wc < 0xe0)
    {
      len = 2;
      wc &= 0x1f;
    }
  else if (wc < 0xf0)
    {
      len = 3;
      wc &= 0x0f;
    }
  else if (wc < 0xf8)
    {
      len = 4;
      wc &= 0x07;
    }
  else if (wc < 0xfc)
    {
      len = 5;
      wc &= 0x03;
    }
  else
    {
      len = 6;
      wc &= 0x01;
    }

  /*
   * Fewer bytes available than the sequence needs: check what is there
   * so that garbage is reported as malformed rather than incomplete.
   */
  if (max_len >= 0 && len > max_len)
    {
      for (i = 1; i < max_len; i++)
        {
          if ((bytes[i] & 0xc0) != 0x80)
            return malformed;
        }

      return incomplete;
    }

  /* Fold in six payload bits from every continuation byte. */
  for (i = 1; i < len; ++i)
    {
      gunichar ch = bytes[i];

      if ((ch & 0xc0) != 0x80)
        {
          /* A zero byte means the string ended inside the sequence. */
          return ch ? malformed : incomplete;
        }

      wc = (wc << 6) | (ch & 0x3f);
    }

  /*
   * UTF8_LENGTH is defined elsewhere; the result is rejected when the
   * length it reports for the decoded value differs from the length
   * announced by the lead byte (presumably an overlong encoding).
   */
  if (UTF8_LENGTH (wc) != len)
    return malformed;

  return wc;
}