示例#1
0
/* Re-encodes STRING, taken to be ISO-8859-1, as UTF-8.
   When is_ascii_string accepts STRING it is returned as is (same pointer);
   otherwise the result is a newly xmalloc'ed, NUL-terminated buffer.  */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t src_len;
  size_t i;
  unsigned char *out;
  unsigned char *out_end;

  /* Nothing to convert: hand the argument back untouched.  */
  if (is_ascii_string (string))
    return string;

  src_len = strlen (string);
  /* Worst case: every input byte expands to a 2-byte sequence.  */
  out = (unsigned char *) xmalloc (2 * src_len + 1);
  out_end = out;

  for (i = 0; i < src_len; i++)
    {
      /* Go through unsigned char so that bytes >= 0x80 are not
         sign-extended into huge code points.  */
      unsigned int uc = (unsigned char) string[i];
      int written = u8_uctomb (out_end, uc, 6);
      assert (written > 0);
      out_end += written;
    }
  *out_end = '\0';
  assert (out_end - out <= 2 * src_len);

  return (char *) out;
}
示例#2
0
文件: ustring.c 项目: Datikos/RHVoice
/* Encodes code point C with u8_uctomb and appends the resulting bytes,
   followed by one zero byte, to U.
   Returns U on success.  Returns NULL, leaving U untouched by this function,
   when C cannot be encoded or when uchars8_reserve reports failure.  */
ustring8_t ustring8_push(ustring8_t u,ucs4_t c)
{
  /* Zero-filled, so the encoded bytes are always followed by a 0 byte.  */
  uint8_t encoded[8]={'\0'};
  int nbytes=u8_uctomb(encoded,c,8);
  if(nbytes>0&&uchars8_reserve(u,uchars8_size(u)+nbytes))
    {
      /* NOTE(review): presumably this removes U's current trailing 0 so the
         new bytes plus terminator replace it -- confirm against uchars8_pop.  */
      uchars8_pop(u);
      uchars8_append(u,encoded,nbytes+1);
      return u;
    }
  return NULL;
}
示例#3
0
文件: russian.c 项目: Datikos/RHVoice
/* Applies the rewrite rules RULE to the code points of U32.

   Builds a list of the form ("#" c0 c1 ... cN "#"), where each ci is a
   string value holding the UTF-8 encoding of one code point of U32, passes
   that list to lts_rewrites, and deletes the list afterwards.

   Returns whatever lts_rewrites returned, or NULL if U32 is empty or
   contains a code point that u8_uctomb cannot encode.  */
cst_val* ustring32_lts_apply(const ustring32_t u32,const cst_lts_rewrites *rule)
{
  size_t n=ustring32_length(u32);
  if(n==0) return NULL;
  cst_val *l=cons_val(string_val("#"),NULL);
  uint8_t b[8];
  size_t i=n;
  int k;
  /* The list is built by prepending, so walk the string back to front.  */
  do
    {
      i--;
      k=u8_uctomb(b,ustring32_at(u32,i),sizeof(b));
      if(k<=0)
        {
          /* u8_uctomb returns a negative value for a code point it cannot
             encode; using that as an index below would write outside B.
             Give up cleanly instead.  */
          delete_val(l);
          return NULL;
        }
      b[k]='\0';
      l=cons_val(string_val((char*)b),l);
    }
  while(i);
  l=cons_val(string_val("#"),l);
  cst_val *output=lts_rewrites(l, rule);
  delete_val(l);
  return output;
}
示例#4
0
/* utf8conv_carefully behaves like iconv, with these differences:
     - both the source and the target encoding are UTF-8,
     - conversion stops at the first error; in that case *INCREMENTED tells
       whether the input pointers have been moved past the error location,
     - when ONE_CHARACTER_ONLY is true, it stops after converting one
       character.
   Returns 0 on success, or (size_t)(-1) with errno set (EINVAL, EILSEQ or
   E2BIG) on error.  *INCREMENTED is only written on error.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *src = *inbuf;
  size_t src_left = *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t status = 0;

  for (;;)
    {
      ucs4_t uc;
      int consumed;
      int produced;

      /* Decode one character from the input.  */
      consumed = u8_mbtoucr (&uc, (const uint8_t *) src, src_left);
      if (consumed < 0)
        {
          /* -2 is reported as EINVAL, any other failure as EILSEQ.  */
          if (consumed == -2)
            errno = EINVAL;
          else
            errno = EILSEQ;
          /* Ask u8_mbtouc how many bytes to step over.  */
          consumed = u8_mbtouc (&uc, (const uint8_t *) src, src_left);
          src += consumed;
          src_left -= consumed;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      /* No room at all in the output: fail before touching the input.  */
      if (dst_left == 0)
        {
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }

      /* Re-encode the character into the output.  */
      produced = u8_uctomb ((uint8_t *) dst, uc, dst_left);
      if (produced == -2)
        {
          /* Output too small for this character; input is not consumed.  */
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }

      /* From here on the input character counts as consumed, even when
         it turns out not to be encodable.  */
      src += consumed;
      src_left -= consumed;
      if (produced == -1)
        {
          errno = EILSEQ;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }
      dst += produced;
      dst_left -= produced;

      if (one_character_only || src_left == 0)
        break;
    }

  /* Publish the updated positions to the caller.  */
  *inbuf = src;
  *inbytesleft = src_left;
  *outbuf = dst;
  *outbytesleft = dst_left;
  return status;
}
示例#5
0
/* Test driver for u8_uctomb (buf, uc, n).

   For one representative character of each encoded length it checks that
     - a buffer size N smaller than the encoded length yields -2 and leaves
       the buffer untouched,
     - a buffer size equal to the encoded length yields that length, stores
       the expected bytes, and leaves the following byte untouched.
   For invalid code points it checks that the result is -1 for every
   N in 0..4 and that nothing is written.

   Every buffer is pre-filled with MAGIC so that any unexpected write is
   detected.  Returns 0 if all assertions hold.  */
int
main ()
{
  /* Test ISO 646 character, in particular the NUL character.  */
  {
    ucs4_t uc;

    /* All code points below 0x80 must encode to one byte equal to
       the code point itself.  */
    for (uc = 0; uc < 0x80; uc++)
      {
        uint8_t buf[5] = { MAGIC, MAGIC, MAGIC, MAGIC, MAGIC };
        int ret;

        ret = u8_uctomb (buf, uc, 0);
        ASSERT (ret == -2);
        ASSERT (buf[0] == MAGIC);

        ret = u8_uctomb (buf, uc, 1);
        ASSERT (ret == 1);
        ASSERT (buf[0] == uc);
        ASSERT (buf[1] == MAGIC);
      }
  }

  /* Test 2-byte character.  */
  {
    /* U+00D7 is expected to encode as C3 97.  */
    ucs4_t uc = 0x00D7;
    uint8_t buf[5] = { MAGIC, MAGIC, MAGIC, MAGIC, MAGIC };
    int ret;

    ret = u8_uctomb (buf, uc, 0);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 1);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 2);
    ASSERT (ret == 2);
    ASSERT (buf[0] == 0xC3);
    ASSERT (buf[1] == 0x97);
    ASSERT (buf[2] == MAGIC);
  }

  /* Test 3-byte character.  */
  {
    /* U+20AC is expected to encode as E2 82 AC.  */
    ucs4_t uc = 0x20AC;
    uint8_t buf[5] = { MAGIC, MAGIC, MAGIC, MAGIC, MAGIC };
    int ret;

    ret = u8_uctomb (buf, uc, 0);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 1);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 2);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);
    ASSERT (buf[1] == MAGIC);

    ret = u8_uctomb (buf, uc, 3);
    ASSERT (ret == 3);
    ASSERT (buf[0] == 0xE2);
    ASSERT (buf[1] == 0x82);
    ASSERT (buf[2] == 0xAC);
    ASSERT (buf[3] == MAGIC);
  }

  /* Test 4-byte character.  */
  {
    /* U+10FFFD is expected to encode as F4 8F BF BD.  */
    ucs4_t uc = 0x10FFFD;
    uint8_t buf[5] = { MAGIC, MAGIC, MAGIC, MAGIC, MAGIC };
    int ret;

    ret = u8_uctomb (buf, uc, 0);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 1);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);

    ret = u8_uctomb (buf, uc, 2);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);
    ASSERT (buf[1] == MAGIC);

    ret = u8_uctomb (buf, uc, 3);
    ASSERT (ret == -2);
    ASSERT (buf[0] == MAGIC);
    ASSERT (buf[1] == MAGIC);
    ASSERT (buf[2] == MAGIC);

    ret = u8_uctomb (buf, uc, 4);
    ASSERT (ret == 4);
    ASSERT (buf[0] == 0xF4);
    ASSERT (buf[1] == 0x8F);
    ASSERT (buf[2] == 0xBF);
    ASSERT (buf[3] == 0xBD);
    ASSERT (buf[4] == MAGIC);
  }

  /* Test invalid characters.  */
  {
    /* One value above 0x10FFFF plus the first and last code point of
       each of the ranges D800..DBFF and DC00..DFFF.  */
    ucs4_t invalid[] = { 0x110000, 0xD800, 0xDBFF, 0xDC00, 0xDFFF };
    uint8_t buf[5] = { MAGIC, MAGIC, MAGIC, MAGIC, MAGIC };
    size_t i;

    for (i = 0; i < SIZEOF (invalid); i++)
      {
        ucs4_t uc = invalid[i];
        int n;

        /* The result must be -1 regardless of the buffer size, including
           sizes that would be too small for a valid character.  */
        for (n = 0; n <= 4; n++)
          {
            int ret = u8_uctomb (buf, uc, n);
            ASSERT (ret == -1);
            ASSERT (buf[0] == MAGIC);
            ASSERT (buf[1] == MAGIC);
            ASSERT (buf[2] == MAGIC);
            ASSERT (buf[3] == MAGIC);
            ASSERT (buf[4] == MAGIC);
          }
      }
  }

  return 0;
}
/* Converts the N units starting at S, decoded with u16_mbtoucr, into a
   sequence of units encoded with u8_uctomb.

   If RESULTBUF is non-NULL it is used as the initial output buffer and
   *LENGTHP gives its capacity in units; when it is NULL or too small, the
   output is moved to malloc'ed memory.

   On success, stores the number of output units in *LENGTHP and returns the
   output buffer, which is either RESULTBUF or malloc'ed memory; it is never
   NULL, even for empty output.  On failure, returns NULL with errno set to
   EILSEQ (invalid input) or ENOMEM, after freeing any memory allocated here.
   An incomplete sequence at the end of the input (u16_mbtoucr returning -2)
   ends the conversion without an error.  */
static DST_UNIT *
FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp)
{
  const SRC_UNIT *const src_end = s + n;
  /* Output accumulator.  Invariants:
     OUT is either == resultbuf or == NULL or malloc-allocated.
     If USED > 0, then OUT != NULL.  */
  DST_UNIT *out = resultbuf;
  size_t capacity = (resultbuf != NULL ? *lengthp : 0);
  size_t used = 0;
  /* errno value to report if we leave through the fail label.  */
  int failure = 0;

  while (s < src_end)
    {
      ucs4_t uc;
      int count;

      /* Fetch a Unicode character from the input string.  */
      count = u16_mbtoucr (&uc, s, src_end - s);
      if (count == -2)
        /* Incomplete sequence of units: stop here, keep what we have.  */
        break;
      if (count < 0)
        {
          failure = EILSEQ;
          goto fail;
        }
      s += count;

      /* Store it in the output string.  */
      count = u8_uctomb (out + used, uc, capacity - used);
      if (count == -1)
        {
          failure = EILSEQ;
          goto fail;
        }
      if (count == -2)
        {
          /* Not enough room: grow geometrically, but always leave at
             least 6 free units so that the retry below must succeed.  */
          DST_UNIT *grown;

          capacity = (capacity > 0 ? 2 * capacity : 12);
          if (capacity < used + 6)
            capacity = used + 6;

          if (out != resultbuf && out != NULL)
            grown =
              (DST_UNIT *) realloc (out, capacity * sizeof (DST_UNIT));
          else
            grown = (DST_UNIT *) malloc (capacity * sizeof (DST_UNIT));

          if (grown == NULL)
            {
              failure = ENOMEM;
              goto fail;
            }
          /* When moving off the caller's buffer, carry over what has been
             written so far; realloc has already done so otherwise.  */
          if (out == resultbuf && used > 0)
            memcpy ((char *) grown, (char *) out, used * sizeof (DST_UNIT));
          out = grown;

          count = u8_uctomb (out + used, uc, capacity - used);
          if (count < 0)
            abort ();
        }
      used += count;
    }

  if (used == 0)
    {
      if (out == NULL)
        {
          /* Return a non-NULL value.  NULL means error.  */
          out = (DST_UNIT *) malloc (1);
          if (out == NULL)
            {
              failure = ENOMEM;
              goto fail;
            }
        }
    }
  else if (out != resultbuf && used < capacity)
    {
      /* Shrink the allocated memory if possible; keep the larger block
         if realloc declines.  */
      DST_UNIT *shrunk = (DST_UNIT *) realloc (out, used * sizeof (DST_UNIT));

      if (shrunk != NULL)
        out = shrunk;
    }

  *lengthp = used;
  return out;

 fail:
  /* Release only memory that this function allocated itself.  */
  if (out != resultbuf && out != NULL)
    free (out);
  errno = failure;
  return NULL;
}
示例#7
0
/* Converts the Unicode character CODE to its multibyte representation
   in the current locale and calls the SUCCESS callback on the resulting
   byte sequence.  If an error occurs, invokes the FAILURE callback instead,
   passing it CODE and an English error string.
   Returns whatever the callback returned.
   Assumes that the locale doesn't change between two calls.

   Notes:
   - The locale's charset and the iconv descriptor are determined on the
     first call and cached in static variables, hence the assumption above.
   - When the iconv conversion itself fails, FAILURE is called with a NULL
     message rather than an error string.
   - CALLBACK_ARG is passed through unchanged to whichever callback runs.  */
long
unicode_to_mb (unsigned int code,
               long (*success) (const char *buf, size_t buflen,
                                void *callback_arg),
               long (*failure) (unsigned int code, const char *msg,
                                void *callback_arg),
               void *callback_arg)
{
  /* State computed once, on the first call.  */
  static int initialized;
  static int is_utf8;
#if HAVE_ICONV
  static iconv_t utf8_to_local;
#endif

  /* Receives the UTF-8 encoding of CODE.  */
  char inbuf[6];
  int count;

  if (!initialized)
    {
      const char *charset = locale_charset ();

      is_utf8 = !strcmp (charset, UTF8_NAME);
#if HAVE_ICONV
      if (!is_utf8)
        {
          utf8_to_local = iconv_open (charset, UTF8_NAME);
          if (utf8_to_local == (iconv_t)(-1))
            /* For an unknown encoding, assume ASCII.  */
            utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
        }
#endif
      initialized = 1;
    }

  /* Test whether the utf8_to_local converter is available at all.  */
  if (!is_utf8)
    {
#if HAVE_ICONV
      /* Both iconv_open attempts above failed.  */
      if (utf8_to_local == (iconv_t)(-1))
        return failure (code, N_("iconv function not usable"), callback_arg);
#else
      /* Built without iconv: a non-UTF-8 locale cannot be served.  */
      return failure (code, N_("iconv function not available"), callback_arg);
#endif
    }

  /* Convert the character to UTF-8.  */
  count = u8_uctomb ((unsigned char *) inbuf, code, sizeof (inbuf));
  if (count < 0)
    return failure (code, N_("character out of range"), callback_arg);

#if HAVE_ICONV
  if (!is_utf8)
    {
      char outbuf[25];
      const char *inptr;
      size_t inbytesleft;
      char *outptr;
      size_t outbytesleft;
      size_t res;

      inptr = inbuf;
      inbytesleft = count;
      outptr = outbuf;
      outbytesleft = sizeof (outbuf);

      /* Convert the character from UTF-8 to the locale's charset.  */
      res = iconv (utf8_to_local,
                   (ICONV_CONST char **)&inptr, &inbytesleft,
                   &outptr, &outbytesleft);
      /* Treat leftover input the same as an outright iconv error.  */
      if (inbytesleft > 0 || res == (size_t)(-1)
          /* Irix iconv() inserts a NUL byte if it cannot convert. */
# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
          || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
# endif
         )
        return failure (code, NULL, callback_arg);

      /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
# if defined _LIBICONV_VERSION \
    || !(((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) \
          && !defined __UCLIBC__) \
         || defined __sun)

      /* Get back to the initial shift state.  */
      res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
      if (res == (size_t)(-1))
        return failure (code, NULL, callback_arg);
# endif

      /* Hand over exactly the bytes iconv produced (not NUL-terminated).  */
      return success (outbuf, outptr - outbuf, callback_arg);
    }
#endif

  /* At this point, is_utf8 is true, so no conversion is needed.  */
  return success (inbuf, count, callback_arg);
}
示例#8
0
/* Reads one string from the input, using phase3_getc for raw characters and
   phase4_getuc for (possibly escaped) UTF-16 units, and returns it as a
   newly xmalloc'ed, NUL-terminated UTF-8 string.

   Blanks (space, tab, CR, FF) in front of the string are skipped.  Returns
   NULL if end of input or end of line comes before any string character.

   If IN_KEY is true, the string ends at the first '=', ':' or blank; blanks
   after it and one '=' or ':' separator are consumed, and a terminating
   newline or EOF is pushed back.  Otherwise the string extends to the end
   of the line, and the terminating newline is consumed.  */
static char *
read_escaped_string (bool in_key)
{
  /* UTF-16 scratch buffer, kept and reused across calls.  */
  static unsigned short *units;
  static size_t units_allocated;
  static size_t units_used;
  int c;

  /* Skip whitespace before the string.  */
  for (;;)
    {
      c = phase3_getc ();
      if (!(c == ' ' || c == '\t' || c == '\r' || c == '\f'))
        break;
    }

  /* Empty string.  */
  if (c == EOF || c == '\n')
    return NULL;

  /* Accumulate the string as UTF-16 first and convert to UTF-8 afterwards.
     Converting unit by unit would not work, because a string can contain
     surrogates like \uD800\uDF00 that must be combined into a single
     UTF-8 character.  */
  units_used = 0;
  while (1)
    {
      if (in_key)
        {
          int at_blank = (c == ' ' || c == '\t' || c == '\r' || c == '\f');

          if (at_blank || c == '=' || c == ':')
            {
              /* Skip whitespace after the string.  */
              while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
                c = phase3_getc ();
              /* Swallow a '=' or ':' separator; anything else belongs to
                 what follows and is pushed back.  */
              if (c != '=' && c != ':')
                phase3_ungetc (c);
              break;
            }
        }

      /* Give the lookahead character back so that phase 4 sees it.  */
      phase3_ungetc (c);

      /* Read the next UTF-16 unit.  */
      c = phase4_getuc ();
      if (c < 0)
        break;

      /* Append it, growing the buffer in steps of 100 units.  */
      if (units_used >= units_allocated)
        {
          units_allocated += 100;
          units = xrealloc (units, units_allocated * sizeof (unsigned short));
        }
      units[units_used++] = c;

      /* Look ahead for the end of the line.  */
      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }

  /* Now convert from UTF-16 to UTF-8.  */
  {
    /* Each UTF-16 unit needs 3 bytes at worst.  */
    unsigned char *utf8 = (unsigned char *) xmalloc (3 * units_used + 1);
    unsigned char *out = utf8;
    size_t pos = 0;

    while (pos < units_used)
      {
        unsigned int uc;
        int written;

        pos += u16_mbtouc (&uc, units + pos, units_used - pos);
        written = u8_uctomb (out, uc, 6);
        assert (written > 0);
        out += written;
      }
    *out = '\0';
    assert (out - utf8 <= 3 * units_used);

    return (char *) utf8;
  }
}
示例#9
0
/* Parses exactly four hexadecimal digits starting at P.
   On success stores their value in *VALUEP and returns 1; returns 0 at the
   first non-digit (which includes the terminating NUL, so this never reads
   beyond the end of the string).  */
static int
java_hex4 (const char *p, unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c1 = (unsigned char) p[i];

      if (c1 >= '0' && c1 <= '9')
        value = (value << 4) + (c1 - '0');
      else if (c1 >= 'A' && c1 <= 'F')
        value = (value << 4) + (c1 - 'A' + 10);
      else if (c1 >= 'a' && c1 <= 'f')
        value = (value << 4) + (c1 - 'a' + 10);
      else
        return 0;
    }
  *valuep = value;
  return 1;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Destructively modifies the argument string in place and
   returns it.

   A high surrogate escape immediately followed by a low surrogate escape
   is combined into a single character.  Escapes that do not denote an
   encodable character (malformed escapes, lone high surrogates, lone low
   surrogates) are copied through unchanged.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size.
     So there is no need to xmalloc the result freshly: an escape occupies
     6 or 12 bytes, its UTF-8 form at most 4, so the write position Q never
     overtakes the read position P.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      /* Number of input bytes of a recognized escape at P; 0 if none.  */
      size_t consumed = 0;
      unsigned int uc = 0;
      int count = 0;

      if (p[0] == '\\' && p[1] == 'u')
        {
          unsigned int n;

          if (java_hex4 (p + 2, &n))
            {
              if (n >= 0xd800 && n < 0xdc00)
                {
                  /* High surrogate: only usable together with a directly
                     following low surrogate escape.  p[6] is safe to read
                     because p[2..5] were hex digits, hence not NUL.  */
                  unsigned int m;

                  if (p[6] == '\\' && p[7] == 'u'
                      && java_hex4 (p + 8, &m)
                      && m >= 0xdc00 && m < 0xe000)
                    {
                      /* Combine two UTF-16 words to a character.  */
                      uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                      consumed = 12;
                    }
                }
              else
                {
                  uc = n;
                  consumed = 6;
                }
            }
        }

      if (consumed > 0)
        /* Fails (negative result, nothing written) for values that are not
           valid characters, e.g. a lone low surrogate such as \uDC00.  */
        count = u8_uctomb (q, uc, 6);

      if (count > 0)
        {
          p += consumed;
          q += count;
        }
      else
        /* No usable escape here: copy just one byte.  Adding a negative
           u8_uctomb result to Q instead would move Q backwards and corrupt
           the output.  */
        *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}