/** * Convert the len characters long character sequence * given in input that is in the given input charset * to a string in given output charset. * * @param input input string * @param len number of bytes in @a input * @param input_charset character set used for @a input * @param output_charset desired character set for the return value * @return the converted string (0-terminated), * if conversion fails, a copy of the orignal * string is returned. */ char * GNUNET_STRINGS_conv (const char *input, size_t len, const char *input_charset, const char *output_charset) { char *ret; uint8_t *u8_string; char *encoded_string; size_t u8_string_length; size_t encoded_string_length; u8_string = u8_conv_from_encoding (input_charset, iconveh_error, input, len, NULL, NULL, &u8_string_length); if (NULL == u8_string) { LOG_STRERROR (GNUNET_ERROR_TYPE_WARNING, "u8_conv_from_encoding"); goto fail; } if (0 == strcmp (output_charset, "UTF-8")) { ret = GNUNET_malloc (u8_string_length + 1); memcpy (ret, u8_string, u8_string_length); ret[u8_string_length] = '\0'; free (u8_string); return ret; } encoded_string = u8_conv_to_encoding (output_charset, iconveh_error, u8_string, u8_string_length, NULL, NULL, &encoded_string_length); free (u8_string); if (NULL == encoded_string) { LOG_STRERROR (GNUNET_ERROR_TYPE_WARNING, "u8_conv_to_encoding"); goto fail; } ret = GNUNET_malloc (encoded_string_length + 1); memcpy (ret, encoded_string, encoded_string_length); ret[encoded_string_length] = '\0'; free (encoded_string); return ret; fail: LOG (GNUNET_ERROR_TYPE_WARNING, _("Character sets requested were `%s'->`%s'\n"), "UTF-8", output_charset); ret = GNUNET_malloc (len + 1); memcpy (ret, input, len); ret[len] = '\0'; return ret; }
/* Convert the N bytes at S from the locale encoding to UTF-8, then
   case-fold and normalize the result via u8_casefold, passing
   ISO639_LANGUAGE, NF, RESULTBUF and LENGTHP straight through.
   Returns whatever u8_casefold returns, or NULL if the conversion to
   UTF-8 fails.  On failure errno is left as set by the failing call.  */
static uint8_t *
ulc_u8_casefold (const char *s, size_t n, const char *iso639_language,
                 uninorm_t nf, uint8_t *resultbuf, size_t *lengthp)
{
  uint8_t stackbuf[2048 / sizeof (uint8_t)];
  size_t utf8_len = sizeof (stackbuf) / sizeof (uint8_t);
  uint8_t *utf8;
  uint8_t *folded;

  /* Step 1: locale encoding -> UTF-8, using the stack buffer when the
     converter chooses to.  */
  utf8 = u8_conv_from_encoding (locale_charset (), iconveh_error,
                                s, n, NULL,
                                stackbuf, &utf8_len);
  if (utf8 == NULL)
    /* errno is set here.  */
    return NULL;

  /* Step 2: case-fold and normalize.  */
  folded = u8_casefold (utf8, utf8_len, iso639_language, nf,
                        resultbuf, lengthp);

  /* Step 3: release the intermediate string if it is not the stack
     buffer.  */
  if (utf8 != stackbuf)
    {
      if (folded != NULL)
        free (utf8);
      else
        {
          /* Keep the errno of the failed u8_casefold call intact across
             free().  */
          int saved_errno = errno;
          free (utf8);
          errno = saved_errno;
        }
    }

  return folded;
}
/* Determine the word breaks of the N bytes at S, which are in the locale
   encoding, and store one result byte per input byte in P[0..N-1].
   Does nothing when N is 0.  */
void
ulc_wordbreaks (const char *s, size_t n, char *p)
{
  const char *charset;
  size_t *map;

  if (n == 0)
    return;

  charset = locale_charset ();
  if (is_utf8_encoding (charset))
    {
      /* Already UTF-8: no conversion needed.  */
      u8_wordbreaks ((const uint8_t *) s, n, p);
      return;
    }

  /* Convert the string to UTF-8 and build a translation table from
     offsets into s to offsets into the translated string.  */
  map = (size_t *) malloc (n * sizeof (size_t));
  if (map != NULL)
    {
      size_t utf8_len;
      uint8_t *utf8 =
        u8_conv_from_encoding (charset, iconveh_question_mark,
                               s, n, map, NULL, &utf8_len);

      if (utf8 != NULL)
        {
          char *breaks = NULL;

          if (utf8_len > 0)
            breaks = (char *) malloc (utf8_len);

          if (utf8_len == 0 || breaks != NULL)
            {
              size_t k;

              /* Determine the word breaks of the UTF-8 string.  */
              u8_wordbreaks (utf8, utf8_len, breaks);

              /* Translate the result back to the original string.
                 Input bytes with no mapped offset keep the value 0.  */
              memset (p, 0, n);
              for (k = 0; k < n; k++)
                {
                  size_t pos = map[k];

                  if (pos != (size_t)(-1))
                    p[k] = breaks[pos];
                }

              free (breaks);
              free (utf8);
              free (map);
              return;
            }
          free (utf8);
        }
      free (map);
    }

  /* Impossible to convert.  */
#if C_CTYPE_ASCII
  if (is_all_ascii (s, n))
    {
      /* ASCII is a subset of UTF-8.  */
      u8_wordbreaks ((const uint8_t *) s, n, p);
      return;
    }
#endif

  /* We have a non-ASCII string and cannot convert it.
     Don't produce any word breaks.  */
  memset (p, 0, n);
}
/* Test driver for u8_conv_from_encoding.

   Each test case is run once per ilseq handler in HANDLERS, and for each
   handler twice: first without an offsets array (o == 0), then with one
   obtained from new_offsets (o == 1).  In the second pass the per-byte
   offsets are checked, and the slot just past the last input byte is
   checked against MAGIC (presumably a sentinel written by new_offsets to
   detect overruns -- new_offsets is defined outside this block).

   Everything is compiled only when HAVE_ICONV is set; otherwise main
   just returns 0.  */
int
main ()
{
  static enum iconv_ilseq_handler handlers[] =
    { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
  size_t h;   /* index into handlers */
  size_t o;   /* 0: no offsets array requested, 1: offsets requested */
  size_t i;   /* index into the offsets array */

#if HAVE_ICONV
  /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
     ISO-8859-2, and UTF-8.  */

  /* Test conversion from ISO-8859-1 to UTF-8 with no errors.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
      static const uint8_t expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("ISO-8859-1", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              /* The non-ASCII input bytes at indices 0, 11 and 17 each
                 become two bytes in EXPECTED, so the expected output
                 offset grows by one after each of them.  */
              for (i = 0; i < 37; i++)
                ASSERT (offsets[i] == (i < 1 ? i :
                                       i < 12 ? i + 1 :
                                       i < 18 ? i + 2 :
                                       i + 3));
              /* The slot past the 37 input bytes must be untouched.  */
              ASSERT (offsets[37] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }

  /* Test conversion from ISO-8859-2 to UTF-8 with no errors.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "Rafa\263 Maszkowski"; /* Rafał Maszkowski */
      static const uint8_t expected[] = "Rafa\305\202 Maszkowski";
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("ISO-8859-2", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              /* Only the byte at index 4 expands to two output bytes, so
                 offsets from index 5 on are shifted by one.  */
              for (i = 0; i < 16; i++)
                ASSERT (offsets[i] == (i < 5 ? i :
                                       i + 1));
              /* The slot past the 16 input bytes must be untouched.  */
              ASSERT (offsets[16] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }

  /* autodetect_jp is only supported when iconv() supports ISO-2022-JP-2.  */
# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
  /* Test conversions from autodetect_jp to UTF-8.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* こんにちは in EUC-JP */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              /* Each character is two input bytes and three output bytes:
                 the first byte of a pair maps to the start of the output
                 character, the second byte is expected to be
                 (size_t)(-1).  */
              for (i = 0; i < 10; i++)
                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
              /* The slot past the 10 input bytes must be untouched.  */
              ASSERT (offsets[10] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* こんにちは in Shift_JIS */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              /* Same two-bytes-in, three-bytes-out pattern as the EUC-JP
                 case above.  */
              for (i = 0; i < 10; i++)
                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
              /* The slot past the 10 input bytes must be untouched.  */
              ASSERT (offsets[10] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* こんにちは in ISO-2022-JP-2 */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              /* The 16 input bytes are a 3-byte escape sequence, five
                 2-byte characters, and a closing 3-byte escape sequence.
                 Only input indices 0, 5, 7, 9, 11 and 13 are expected to
                 carry an output offset (0, 3, 6, 9, 12, 15); every other
                 index is expected to be (size_t)(-1).  */
              for (i = 0; i < 16; i++)
                ASSERT (offsets[i] == (i == 0 ? 0 :
                                       i == 5 ? 3 :
                                       i == 7 ? 6 :
                                       i == 9 ? 9 :
                                       i == 11 ? 12 :
                                       i == 13 ? 15 :
                                       (size_t)(-1)));
              /* The slot past the 16 input bytes must be untouched.  */
              ASSERT (offsets[16] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
# endif

#endif

  return 0;
}
void ulc_possible_linebreaks (const char *s, size_t n, const char *encoding, char *p) { if (n > 0) { if (is_utf8_encoding (encoding)) u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); else { /* Convert the string to UTF-8 and build a translation table from offsets into s to offsets into the translated string. */ size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); if (offsets != NULL) { uint8_t *t; size_t m; t = u8_conv_from_encoding (encoding, iconveh_question_mark, s, n, offsets, NULL, &m); if (t != NULL) { char *q = (char *) (m > 0 ? malloc (m) : NULL); if (m == 0 || q != NULL) { size_t i; /* Determine the possible line breaks of the UTF-8 string. */ u8_possible_linebreaks (t, m, encoding, q); /* Translate the result back to the original string. */ memset (p, UC_BREAK_PROHIBITED, n); for (i = 0; i < n; i++) if (offsets[i] != (size_t)(-1)) p[i] = q[offsets[i]]; free (q); free (t); free (offsets); return; } free (t); } free (offsets); } /* Impossible to convert. */ #if C_CTYPE_ASCII if (is_all_ascii (s, n)) { /* ASCII is a subset of UTF-8. */ u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p); return; } #endif /* We have a non-ASCII string and cannot convert it. Don't produce line breaks except those already present in the input string. All we assume here is that the encoding is minimally ASCII compatible. */ { const char *s_end = s + n; while (s < s_end) { *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); s++; p++; } } } } }
int ulc_width_linebreaks (const char *s, size_t n, int width, int start_column, int at_end_columns, const char *o, const char *encoding, char *p) { if (n > 0) { if (is_utf8_encoding (encoding)) return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); else { /* Convert the string to UTF-8 and build a translation table from offsets into s to offsets into the translated string. */ size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); if (offsets != NULL) { uint8_t *t; size_t m; t = u8_conv_from_encoding (encoding, iconveh_question_mark, s, n, offsets, NULL, &m); if (t != NULL) { char *memory = (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL); if (m == 0 || memory != NULL) { char *q = (char *) memory; char *o8 = (o != NULL ? (char *) (q + m) : NULL); int res_column; size_t i; /* Translate the overrides to the UTF-8 string. */ if (o != NULL) { memset (o8, UC_BREAK_UNDEFINED, m); for (i = 0; i < n; i++) if (offsets[i] != (size_t)(-1)) o8[offsets[i]] = o[i]; } /* Determine the line breaks of the UTF-8 string. */ res_column = u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q); /* Translate the result back to the original string. */ memset (p, UC_BREAK_PROHIBITED, n); for (i = 0; i < n; i++) if (offsets[i] != (size_t)(-1)) p[i] = q[offsets[i]]; free (memory); free (t); free (offsets); return res_column; } free (t); } free (offsets); } /* Impossible to convert. */ #if C_CTYPE_ASCII if (is_all_ascii (s, n)) { /* ASCII is a subset of UTF-8. */ return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); } #endif /* We have a non-ASCII string and cannot convert it. Don't produce line breaks except those already present in the input string. All we assume here is that the encoding is minimally ASCII compatible. */ { const char *s_end = s + n; while (s < s_end) { *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' ? 
UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); s++; p++; if (o != NULL) o++; } /* We cannot compute widths in this case. */ } } } return start_column; }
/* Determine the grapheme cluster breaks of the N bytes at S, which are
   in the locale encoding, and store one result byte per input byte in
   P[0..N-1].  Does nothing when N is 0.  */
void
ulc_grapheme_breaks (const char *s, size_t n, char *p)
{
  const char *charset;
  size_t *map;

  if (n == 0)
    return;

  charset = locale_charset ();
  if (is_utf8_encoding (charset))
    {
      /* Already UTF-8: no conversion needed.  */
      u8_grapheme_breaks ((const uint8_t *) s, n, p);
      return;
    }

  /* Convert the string to UTF-8 and build a translation table from
     offsets into s to offsets into the translated string.  */
  map = (size_t *) malloc (n * sizeof (size_t));
  if (map != NULL)
    {
      size_t utf8_len;
      uint8_t *utf8 =
        u8_conv_from_encoding (charset, iconveh_question_mark,
                               s, n, map, NULL, &utf8_len);

      if (utf8 != NULL)
        {
          char *breaks = NULL;

          if (utf8_len > 0)
            breaks = (char *) malloc (utf8_len);

          if (utf8_len == 0 || breaks != NULL)
            {
              size_t k;

              /* Determine the grapheme breaks of the UTF-8 string.  */
              u8_grapheme_breaks (utf8, utf8_len, breaks);

              /* Translate the result back to the original string.
                 Input bytes with no mapped offset keep the value 0.  */
              memset (p, 0, n);
              for (k = 0; k < n; k++)
                {
                  size_t pos = map[k];

                  if (pos != (size_t)(-1))
                    p[k] = breaks[pos];
                }

              free (breaks);
              free (utf8);
              free (map);
              return;
            }
          free (utf8);
        }
      free (map);
    }

  /* Impossible to convert.  */
#if C_CTYPE_ASCII
  /* Fall back to ASCII as best we can.  */
  ascii_grapheme_breaks (s, n, p);
#else
  /* We cannot make any assumptions.  Mark only the first byte.  */
  p[0] = 1;
  memset (p + 1, 0, n - 1);
#endif
}
/* Make sure a string is usable as UTF-8.
 *
 * @param str  NUL-terminated input string; may be NULL
 * @return     NULL if str is NULL or the conversion fails;
 *             str itself (same pointer) if it already passes u8_check();
 *             otherwise the NUL-terminated buffer produced by
 *             u8_conv_from_encoding(), in which bytes that cannot be
 *             converted are handled per iconveh_question_mark.
 *             Callers can tell the last two cases apart by comparing the
 *             returned pointer with str.
 */
char *
unicode_fixup_string(char *str)
{
  uint8_t *ret;
  size_t len;

  if (!str)
    return NULL;

  len = strlen(str);

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
    return str;

  /* u8_conv_from_encoding() yields a length-counted buffer and adds no
   * terminator of its own.  Convert len + 1 bytes so that the input's
   * terminating NUL is carried over and the result is a proper C string.
   */
  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len + 1, NULL, NULL, &len);
  if (!ret)
    {
      DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

      return NULL;
    }

  return (char *)ret;
}
/* Make sure a string is usable as UTF-8, stripping a leading UTF-8
 * byte-order mark from strings that are already valid.
 *
 * @param str  NUL-terminated input string; may be NULL.  Modified in place
 *             when it is valid UTF-8 and starts with a byte-order mark.
 * @return     NULL if str is NULL or the conversion fails;
 *             str itself (same pointer) if it already passes u8_check();
 *             otherwise the NUL-terminated buffer produced by
 *             u8_conv_from_encoding(), in which bytes that cannot be
 *             converted are handled per iconveh_question_mark.
 *             Callers can tell the last two cases apart by comparing the
 *             returned pointer with str.
 */
char *
unicode_fixup_string(char *str)
{
  uint8_t *ret;
  size_t len;

  if (!str)
    return NULL;

  len = strlen(str);

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
    {
      if (len >= 3)
	{
	  /* Check for and strip byte-order mark; the move includes the
	   * terminating NUL (hence the + 1) */
	  if (memcmp("\xef\xbb\xbf", str, 3) == 0)
	    memmove(str, str + 3, len - 3 + 1);
	}

      return str;
    }

  /* u8_conv_from_encoding() yields a length-counted buffer and adds no
   * terminator of its own.  Convert len + 1 bytes so that the input's
   * terminating NUL is carried over and the result is a proper C string.
   */
  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len + 1, NULL, NULL, &len);
  if (!ret)
    {
      DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

      return NULL;
    }

  return (char *)ret;
}