/* Mark the grapheme cluster break positions in the UTF-16 string S of N
   units.  The result is written to P, one entry per unit of S:
     - the entry at the first unit of each decoded character receives the
       value of uc_is_grapheme_break (previous character, this character),
       where the "previous character" of the very first one is 0;
     - when a character occupies more than one unit, the entry at its
       second unit is set to 0.
   P must therefore have room for at least N entries.  */
void
u16_grapheme_breaks (const uint16_t *s, size_t n, char *p)
{
  ucs4_t last = 0;

  while (n > 0)
    {
      ucs4_t cur;
      /* Number of units consumed by this character.  The loop relies on
         u16_mbtouc returning a positive count that does not exceed N.  */
      int units = u16_mbtouc (&cur, s, n);

      p[0] = uc_is_grapheme_break (last, cur);
      if (units > 1)
        /* Never a break in the middle of a character.  */
        p[1] = 0;

      last = cur;

      /* Advance input and output in lockstep.  */
      s += units;
      p += units;
      n -= units;
    }
}
/* Return true if C is one of the characters treated as in-line whitespace
   by read_escaped_string: space, tab, carriage return or form feed.
   Newline is deliberately not included.  */
static bool
is_properties_blank (int c)
{
  return c == ' ' || c == '\t' || c == '\r' || c == '\f';
}

/* Read a key (IN_KEY true) or a value (IN_KEY false) from the input and
   return it as a freshly xmalloc'ed, NUL-terminated UTF-8 string.

   Leading whitespace is skipped.  If the first non-whitespace character is
   a newline or EOF, the string is empty and NULL is returned; that
   character is consumed, not pushed back.

   When IN_KEY is true, the string also ends at '=', ':' or whitespace.
   Whitespace following the key is skipped, and one '=' or ':' separator
   after it is consumed; any other character is pushed back.  A newline or
   EOF that ends a key is pushed back, whereas one that ends a value is
   consumed.

   The UTF-16 accumulation buffer is static and reused across calls; only
   the returned UTF-8 string is newly allocated.  */
static char *
read_escaped_string (bool in_key)
{
  static unsigned short *utf16_buf;
  static size_t utf16_cap;
  static size_t utf16_len;
  int ch;
  size_t in;
  unsigned char *utf8;
  unsigned char *out;

  /* Skip whitespace before the string.  */
  ch = phase3_getc ();
  while (is_properties_blank (ch))
    ch = phase3_getc ();

  /* Empty string.  */
  if (ch == EOF || ch == '\n')
    return NULL;

  /* Accumulate the string as UTF-16 first and convert it to UTF-8 only at
     the end.  A direct conversion per character would not work, because a
     string can contain surrogates like \uD800\uDF00, which must be combined
     into a single UTF-8 character.  */
  utf16_len = 0;
  for (;;)
    {
      if (in_key
          && (is_properties_blank (ch) || ch == '=' || ch == ':'))
        {
          /* End of key: skip whitespace after the string...  */
          while (is_properties_blank (ch))
            ch = phase3_getc ();
          /* ...and swallow a '=' or ':' separator; anything else belongs
             to what follows and goes back to the input.  */
          if (ch != '=' && ch != ':')
            phase3_ungetc (ch);
          break;
        }

      /* CH was only a lookahead; hand it back so that phase 4 sees it.  */
      phase3_ungetc (ch);

      /* Read the next UTF-16 code unit; a negative value ends the string.  */
      ch = phase4_getuc ();
      if (ch < 0)
        break;

      /* Append it to the buffer, enlarging the buffer when it is full.  */
      if (utf16_len >= utf16_cap)
        {
          utf16_cap += 100;
          utf16_buf =
            xrealloc (utf16_buf, utf16_cap * sizeof (unsigned short));
        }
      utf16_buf[utf16_len++] = ch;

      /* Look at the character after it.  */
      ch = phase3_getc ();
      if (ch == EOF || ch == '\n')
        {
          if (in_key)
            phase3_ungetc (ch);
          break;
        }
    }

  /* Now convert from UTF-16 to UTF-8.
     Each UTF-16 word needs 3 bytes at worst.  */
  utf8 = (unsigned char *) xmalloc (3 * utf16_len + 1);
  out = utf8;
  in = 0;
  while (in < utf16_len)
    {
      unsigned int uc;
      int nbytes;

      in += u16_mbtouc (&uc, utf16_buf + in, utf16_len - in);
      nbytes = u8_uctomb (out, uc, 6);
      assert (nbytes > 0);
      out += nbytes;
    }
  *out = '\0';
  assert (out - utf8 <= 3 * utf16_len);

  return (char *) utf8;
}