Example #1
0
/* Walk the N UTF-16 units starting at S one character at a time and fill
   the parallel array P.

   For every character decoded by u16_mbtouc, the element of P that
   corresponds to the character's first unit receives the result of
   uc_is_grapheme_break (previous character, this character); for the very
   first character the "previous character" is 0.  When a character takes
   more than one unit, the element for its second unit is set to 0.  */
void
u16_grapheme_breaks (const uint16_t *s, size_t n, char *p)
{
  ucs4_t last_char = 0;

  while (n > 0)
    {
      ucs4_t this_char;
      int units = u16_mbtouc (&this_char, s, n);

      *p = uc_is_grapheme_break (last_char, this_char);
      /* A trailing unit of the same character never starts anything.  */
      if (units > 1)
        p[1] = 0;

      last_char = this_char;

      /* Step input and output forward in lockstep.  */
      s += units;
      p += units;
      n -= units;
    }
}
Example #2
0
/* Read one string from the input and return it as UTF-8.

   Leading blanks (' ', '\t', '\r', '\f') are skipped first.  If the next
   character is then a newline or EOF, there is no string and NULL is
   returned.  Otherwise the result is a NUL-terminated string obtained from
   xmalloc.

   IN_KEY selects where the string stops.  When it is true, the string ends
   at the first blank, '=' or ':' that phase3_getc delivers at a character
   boundary; the blanks that follow and one '=' or ':' are consumed, and a
   terminating newline or EOF is pushed back.  When it is false, the string
   runs up to the newline or EOF, which is consumed.  In both cases reading
   also stops as soon as phase4_getuc returns a negative value.  */
static char *
read_escaped_string (bool in_key)
{
  /* UTF-16 scratch buffer.  It is static, so its storage is reused by
     later calls and only ever grows.  */
  static unsigned short *buffer;
  static size_t bufmax;
  static size_t buflen;
  int c;

  /* Drop the blanks in front of the string.  */
  c = phase3_getc ();
  while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
    c = phase3_getc ();

  /* Only blanks up to the end of the line or of the input: no string.  */
  if (c == '\n' || c == EOF)
    return NULL;

  /* Collect the string as UTF-16 first and convert to UTF-8 afterwards.
     Converting unit by unit would not work: a character may be written as
     a surrogate pair such as \uD800\uDF00, and both halves have to end up
     in one single UTF-8 sequence.  */
  buflen = 0;
  while (1)
    {
      if (in_key)
        {
          bool at_blank = (c == ' ' || c == '\t' || c == '\r' || c == '\f');

          if (at_blank || c == '=' || c == ':')
            {
              /* The key is complete.  Swallow the blanks behind it ...  */
              while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
                c = phase3_getc ();
              /* ... and a single '=' or ':' separator if one follows;
                 any other character is pushed back.  */
              if (c != '=' && c != ':')
                phase3_ungetc (c);
              break;
            }
        }

      /* C was only peeked at; hand it back so that phase4_getuc sees it.  */
      phase3_ungetc (c);

      /* Fetch the next UTF-16 unit; a negative value ends the string.  */
      c = phase4_getuc ();
      if (c < 0)
        break;

      /* Store it, enlarging the buffer in steps of 100 units as needed.  */
      if (bufmax <= buflen)
        {
          bufmax += 100;
          buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
        }
      buffer[buflen] = c;
      buflen++;

      /* Peek at what follows.  End of line or of input ends the string;
         for a key the terminator is left in the input.  */
      c = phase3_getc ();
      if (c == '\n' || c == EOF)
        {
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }

  /* Turn the collected UTF-16 units into UTF-8.  */
  {
    /* No UTF-16 unit expands to more than 3 bytes; one more for the NUL.  */
    unsigned char *utf8_string = (unsigned char *) xmalloc (3 * buflen + 1);
    unsigned char *out = utf8_string;
    size_t pos = 0;

    while (pos < buflen)
      {
        unsigned int uc;
        int nbytes;

        pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
        nbytes = u8_uctomb (out, uc, 6);
        assert (nbytes > 0);
        out += nbytes;
      }
    *out = '\0';
    assert (out - utf8_string <= 3 * buflen);

    return (char *) utf8_string;
  }
}