示例#1
0
/**
 * Convert the len characters long character sequence
 * given in input that is in the given input charset
 * to a string in given output charset.
 *
 * @param input input string
 * @param len number of bytes in @a input
 * @param input_charset character set used for @a input
 * @param output_charset desired character set for the return value
 * @return the converted string (0-terminated),
 *  if conversion fails, a copy of the orignal
 *  string is returned.
 */
char *
GNUNET_STRINGS_conv (const char *input,
		     size_t len,
		     const char *input_charset,
		     const char *output_charset)
{
  char *ret;
  uint8_t *u8_string;
  char *encoded_string;
  size_t u8_string_length;
  size_t encoded_string_length;

  u8_string = u8_conv_from_encoding (input_charset,
				     iconveh_error,
				     input, len,
				     NULL, NULL,
				     &u8_string_length);
  if (NULL == u8_string)
  {
    LOG_STRERROR (GNUNET_ERROR_TYPE_WARNING, "u8_conv_from_encoding");
    goto fail;
  }
  if (0 == strcmp (output_charset, "UTF-8"))
  {
    ret = GNUNET_malloc (u8_string_length + 1);
    memcpy (ret, u8_string, u8_string_length);
    ret[u8_string_length] = '\0';
    free (u8_string);
    return ret;
  }
  encoded_string = u8_conv_to_encoding (output_charset, iconveh_error,
					u8_string, u8_string_length,
					NULL, NULL,
					&encoded_string_length);
  free (u8_string);
  if (NULL == encoded_string)
  {
    LOG_STRERROR (GNUNET_ERROR_TYPE_WARNING, "u8_conv_to_encoding");
    goto fail;
  }
  ret = GNUNET_malloc (encoded_string_length + 1);
  memcpy (ret, encoded_string, encoded_string_length);
  ret[encoded_string_length] = '\0';
  free (encoded_string);
  return ret;
 fail:
  LOG (GNUNET_ERROR_TYPE_WARNING, _("Character sets requested were `%s'->`%s'\n"),
       "UTF-8", output_charset);
  ret = GNUNET_malloc (len + 1);
  memcpy (ret, input, len);
  ret[len] = '\0';
  return ret;
}
示例#2
0
static uint8_t *
ulc_u8_casefold (const char *s, size_t n, const char *iso639_language,
                 uninorm_t nf,
                 uint8_t *resultbuf, size_t *lengthp)
{
  uint8_t convbuf[2048 / sizeof (uint8_t)];
  uint8_t *conv;
  size_t conv_length;
  uint8_t *result;

  /* Convert the string to UTF-8.  */
  conv_length = sizeof (convbuf) / sizeof (uint8_t);
  conv =
    u8_conv_from_encoding (locale_charset (), iconveh_error, s, n, NULL,
                           convbuf, &conv_length);
  if (conv == NULL)
    /* errno is set here.  */
    return NULL;

  /* Case-fold and normalize.  */
  result = u8_casefold (conv, conv_length, iso639_language, nf,
                        resultbuf, lengthp);
  if (result == NULL)
    {
      if (conv != convbuf)
        {
          int saved_errno = errno;
          free (conv);
          errno = saved_errno;
        }
      return NULL;
    }

  if (conv != convbuf)
    free (conv);
  return result;
}
示例#3
0
void
ulc_wordbreaks (const char *s, size_t n, char *p)
{
  if (n > 0)
    {
      const char *encoding = locale_charset ();

      if (is_utf8_encoding (encoding))
        u8_wordbreaks ((const uint8_t *) s, n, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.  */
          size_t *offsets = (size_t *) malloc (n * sizeof (size_t));

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  char *q = (char *) (m > 0 ? malloc (m) : NULL);

                  if (m == 0 || q != NULL)
                    {
                      size_t i;

                      /* Determine the word breaks of the UTF-8 string.  */
                      u8_wordbreaks (t, m, q);

                      /* Translate the result back to the original string.  */
                      memset (p, 0, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1))
                          p[i] = q[offsets[i]];

                      free (q);
                      free (t);
                      free (offsets);
                      return;
                    }
                  free (t);
                }
              free (offsets);
            }

          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              u8_wordbreaks ((const uint8_t *) s, n, p);
              return;
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce any word breaks.  */
          memset (p, 0, n);
        }
    }
}
int
main ()
{
  static enum iconv_ilseq_handler handlers[] =
    { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
  size_t h;
  size_t o;
  size_t i;

#if HAVE_ICONV
  /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
     ISO-8859-2, and UTF-8.  */

  /* Test conversion from ISO-8859-1 to UTF-8 with no errors.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
      static const uint8_t expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("ISO-8859-1", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              for (i = 0; i < 37; i++)
                ASSERT (offsets[i] == (i < 1 ? i :
                                       i < 12 ? i + 1 :
                                       i < 18 ? i + 2 :
                                       i + 3));
              ASSERT (offsets[37] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }

  /* Test conversion from ISO-8859-2 to UTF-8 with no errors.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "Rafa\263 Maszkowski"; /* Rafał Maszkowski */
      static const uint8_t expected[] = "Rafa\305\202 Maszkowski";
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("ISO-8859-2", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              for (i = 0; i < 16; i++)
                ASSERT (offsets[i] == (i < 5 ? i :
                                       i + 1));
              ASSERT (offsets[16] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }

  /* autodetect_jp is only supported when iconv() support ISO-2022-JP-2.  */
# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
  /* Test conversions from autodetect_jp to UTF-8.  */
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* こんにちは in EUC-JP */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              for (i = 0; i < 10; i++)
                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
              ASSERT (offsets[10] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* こんにちは in Shift_JIS */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              for (i = 0; i < 10; i++)
                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
              ASSERT (offsets[10] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
  for (h = 0; h < SIZEOF (handlers); h++)
    {
      enum iconv_ilseq_handler handler = handlers[h];
      static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* こんにちは in ISO-2022-JP-2 */
      static const uint8_t expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
      for (o = 0; o < 2; o++)
        {
          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
          size_t length;
          uint8_t *result = u8_conv_from_encoding ("autodetect_jp", handler,
                                                   input, strlen (input),
                                                   offsets,
                                                   NULL, &length);
          ASSERT (result != NULL);
          ASSERT (length == u8_strlen (expected));
          ASSERT (u8_cmp (result, expected, u8_strlen (expected)) == 0);
          if (o)
            {
              for (i = 0; i < 16; i++)
                ASSERT (offsets[i] == (i == 0 ? 0 :
                                       i == 5 ? 3 :
                                       i == 7 ? 6 :
                                       i == 9 ? 9 :
                                       i == 11 ? 12 :
                                       i == 13 ? 15 :
                                       (size_t)(-1)));
              ASSERT (offsets[16] == MAGIC);
              free (offsets);
            }
          free (result);
        }
    }
# endif

#endif

  return 0;
}
示例#5
0
void
ulc_possible_linebreaks (const char *s, size_t n, const char *encoding,
                         char *p)
{
  if (n > 0)
    {
      if (is_utf8_encoding (encoding))
        u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.  */
          size_t *offsets = (size_t *) malloc (n * sizeof (size_t));

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  char *q = (char *) (m > 0 ? malloc (m) : NULL);

                  if (m == 0 || q != NULL)
                    {
                      size_t i;

                      /* Determine the possible line breaks of the UTF-8
                         string.  */
                      u8_possible_linebreaks (t, m, encoding, q);

                      /* Translate the result back to the original string.  */
                      memset (p, UC_BREAK_PROHIBITED, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1))
                          p[i] = q[offsets[i]];

                      free (q);
                      free (t);
                      free (offsets);
                      return;
                    }
                  free (t);
                }
              free (offsets);
            }

          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
              return;
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce line breaks except those already present in the
             input string.  All we assume here is that the encoding is
             minimally ASCII compatible.  */
          {
            const char *s_end = s + n;
            while (s < s_end)
              {
                *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
                s++;
                p++;
              }
          }
        }
    }
}
示例#6
0
int
ulc_width_linebreaks (const char *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  if (n > 0)
    {
      if (is_utf8_encoding (encoding))
        return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.  */
          size_t *offsets = (size_t *) malloc (n * sizeof (size_t));

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  char *memory =
                    (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);

                  if (m == 0 || memory != NULL)
                    {
                      char *q = (char *) memory;
                      char *o8 = (o != NULL ? (char *) (q + m) : NULL);
                      int res_column;
                      size_t i;

                      /* Translate the overrides to the UTF-8 string.  */
                      if (o != NULL)
                        {
                          memset (o8, UC_BREAK_UNDEFINED, m);
                          for (i = 0; i < n; i++)
                            if (offsets[i] != (size_t)(-1))
                              o8[offsets[i]] = o[i];
                        }

                      /* Determine the line breaks of the UTF-8 string.  */
                      res_column =
                        u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);

                      /* Translate the result back to the original string.  */
                      memset (p, UC_BREAK_PROHIBITED, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1))
                          p[i] = q[offsets[i]];

                      free (memory);
                      free (t);
                      free (offsets);
                      return res_column;
                    }
                  free (t);
                }
              free (offsets);
            }
          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce line breaks except those already present in the
             input string.  All we assume here is that the encoding is
             minimally ASCII compatible.  */
          {
            const char *s_end = s + n;
            while (s < s_end)
              {
                *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
                      ? UC_BREAK_MANDATORY
                      : UC_BREAK_PROHIBITED);
                s++;
                p++;
                if (o != NULL)
                  o++;
              }
            /* We cannot compute widths in this case.  */
          }
        }
    }
  return start_column;
}
void
ulc_grapheme_breaks (const char *s, size_t n, char *p)
{
  if (n > 0)
    {
      const char *encoding = locale_charset ();

      if (is_utf8_encoding (encoding))
        u8_grapheme_breaks ((const uint8_t *) s, n, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.  */
          size_t *offsets = (size_t *) malloc (n * sizeof (size_t));

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  char *q = (char *) (m > 0 ? malloc (m) : NULL);

                  if (m == 0 || q != NULL)
                    {
                      size_t i;

                      /* Determine the grapheme breaks of the UTF-8 string.  */
                      u8_grapheme_breaks (t, m, q);

                      /* Translate the result back to the original string.  */
                      memset (p, 0, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1))
                          p[i] = q[offsets[i]];

                      free (q);
                      free (t);
                      free (offsets);
                      return;
                    }
                  free (t);
                }
              free (offsets);
            }

          /* Impossible to convert. */
#if C_CTYPE_ASCII
          /* Fall back to ASCII as best we can. */
          ascii_grapheme_breaks (s, n, p);
#else
          /* We cannot make any assumptions. */
          p[0] = 1;
          memset (p + 1, 0, n - 1);
#endif
        }
    }
}
示例#8
-1
char *
unicode_fixup_string(char *str)
{
  uint8_t *ret;
  size_t len;

  if (!str)
    return NULL;

  len = strlen(str);

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
    return str;

  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len, NULL, NULL, &len);
  if (!ret)
    {
      DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

      return NULL;
    }

  return (char *)ret;
}
示例#9
-1
char *
unicode_fixup_string(char *str)
{
  uint8_t *ret;
  size_t len;

  if (!str)
    return NULL;

  len = strlen(str);

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
    {
      if (len >= 3)
	{
	  /* Check for and strip byte-order mark */
	  if (memcmp("\xef\xbb\xbf", str, 3) == 0)
	    memmove(str, str + 3, len - 3 + 1);
	}

      return str;
    }

  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len, NULL, NULL, &len);
  if (!ret)
    {
      DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

      return NULL;
    }

  return (char *)ret;
}