/* Returns true if the N bytes starting at S_ are acceptable as UTF-8 text.

   Every byte below 0x80 must satisfy encoding_guess_is_ascii_text().  Every
   other byte must start a sequence that u8_mbtoucr() decodes.  When
   u8_mbtoucr() fails, the buffer is still accepted if the failure code is
   -2 and rejected for any other negative code.  An empty buffer is
   accepted. */
static bool
is_all_utf8_text (const void *s_, size_t n)
{
  const uint8_t *bytes = s_;
  size_t pos = 0;

  while (pos < n)
    {
      ucs4_t uc;
      int len;

      /* Single-byte case: judged by the ASCII text predicate alone. */
      if (bytes[pos] < 0x80)
        {
          if (!encoding_guess_is_ascii_text (bytes[pos]))
            return false;
          pos++;
          continue;
        }

      /* Multibyte case: the decoder's verdict ends the scan on failure. */
      len = u8_mbtoucr (&uc, bytes + pos, n - pos);
      if (len < 0)
        return len == -2;
      pos += len;
    }

  return true;
}
/* Returns the number of characters found in the N units starting at S.

   Each call to u8_mbtoucr() accounts for one character, whatever it
   returns.  A return of -2 ends the count there.  Any other negative return
   is re-measured with u8_mbtouc() so that the scan can step past it, and a
   return of 0 is treated as a step of one unit. */
size_t
u8_mbsnlen (const uint8_t *s, size_t n)
{
  size_t total = 0;

  while (n > 0)
    {
      ucs4_t uc;
      int step = u8_mbtoucr (&uc, s, n);

      total++;

      if (step == -2)
        return total;

      if (step == 0)
        step = 1;
      else if (step < 0)
        step = u8_mbtouc (&uc, s, n);

      s += step;
      n -= step;
    }

  return total;
}
static void check_segmentation (const char *input, size_t length, bool print_segments) { size_t offset, line_number, line_offset; struct segmenter s; int prev_type; segmenter_init (&s, mode); line_number = 1; line_offset = 0; prev_type = -1; for (offset = 0; offset < length; ) { enum segment_type type; const char *type_name, *p; int n; if (one_byte) { int n_newlines = 0; int i; for (i = 0; i <= length - offset; i++) { /* Make a copy to ensure that segmenter_push() isn't actually looking ahead. */ char *copy; if (i > 0 && input[offset + i - 1] == '\n') n_newlines++; copy = xmemdup (input + offset, i); n = segmenter_push (&s, copy, i, &type); free (copy); if (n >= 0) break; } assert (n_newlines <= 2); } else n = segmenter_push (&s, input + offset, length - offset, &type); if (n < 0) error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu", offset); assert (offset + n <= length); if (type == SEG_NEWLINE) assert ((n == 1 && input[offset] == '\n') || (n == 2 && input[offset] == '\r' && input[offset + 1] == '\n')); else assert (memchr (&input[offset], '\n', n) == NULL); if (!print_segments) { offset += n; continue; } if (!verbose) { if (prev_type != SEG_SPACES && prev_type != -1 && type == SEG_SPACES && n == 1 && input[offset] == ' ') { printf (" space\n"); offset++; prev_type = -1; continue; } } if (prev_type != -1) putchar ('\n'); prev_type = type; if (verbose) printf ("%2zu:%2zu: ", line_number, offset - line_offset); type_name = segment_type_to_string (type); for (p = type_name; *p != '\0'; p++) putchar (tolower ((unsigned char) *p)); if (n > 0) { int i; for (i = MIN (15, strlen (type_name)); i < 16; i++) putchar (' '); for (i = 0; i < n; ) { const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); ucs4_t uc; int mblen; mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i); if (mblen < 0) { int j; mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i); putchar ('<'); for (j = 0; j < mblen; j++) { if (j > 0) putchar (' '); printf ("%02x", input[offset 
+ i + j]); } putchar ('>'); } else { switch (uc) { case ' ': printf ("_"); break; case '_': printf ("\\_"); break; case '\\': printf ("\\\\"); break; case '\t': printf ("\\t"); break; case '\r': printf ("\\r"); break; case '\n': printf ("\\n"); break; case '\v': printf ("\\v"); break; default: if (uc < 0x20 || uc == 0x00a0) printf ("<U+%04X>", uc); else fwrite (input + offset + i, 1, mblen, stdout); break; } } i += mblen; } } offset += n; if (type == SEG_NEWLINE) { enum prompt_style prompt; line_number++; line_offset = offset; prompt = segmenter_get_prompt (&s); printf (" (%s)\n", prompt_style_to_string (prompt)); } }
/* Decodes the first N units of INPUT with u8_mbtoucr() and checks that the
   call returns EXPECTED_RET and stores EXPECTED_UC.  The output variable is
   preset to a poison value so that a call that stores nothing is caught.
   This is a macro, not a function, so that each ASSERT is expanded at the
   test case that uses it. */
#define CHECK_DECODE(input, n, expected_ret, expected_uc) \
  do                                                      \
    {                                                     \
      ucs4_t decoded = 0xBADFACE;                         \
      int result = u8_mbtoucr (&decoded, (input), (n));   \
      ASSERT (result == (expected_ret));                  \
      ASSERT (decoded == (expected_uc));                  \
    }                                                     \
  while (0)

/* Unit test for u8_mbtoucr(): well-formed sequences of every length, then
   truncated and malformed sequences of one, two and three bytes.  Every
   failing case is expected to store U+FFFD; the return value is -2 for the
   inputs expected to be reported as incomplete and -1 for the ones expected
   to be reported as invalid. */
int
main ()
{
  /* A NUL unit is an ordinary one-unit character. */
  {
    static const uint8_t input[] = "";
    CHECK_DECODE (input, 1, 1, 0);
  }

  /* Every ISO 646 unit decodes to itself. */
  {
    ucs4_t c;
    uint8_t buf[1];

    for (c = 0; c < 0x80; c++)
      {
        buf[0] = c;
        CHECK_DECODE (buf, 1, 1, c);
      }
  }

  /* 2-byte character. */
  {
    static const uint8_t input[] = { 0xC3, 0x97 };
    CHECK_DECODE (input, 2, 2, 0x00D7);
  }

  /* 3-byte character. */
  {
    static const uint8_t input[] = { 0xE2, 0x82, 0xAC };
    CHECK_DECODE (input, 3, 3, 0x20AC);
  }

  /* 4-byte character. */
  {
    static const uint8_t input[] = { 0xF4, 0x8F, 0xBF, 0xBD };
    CHECK_DECODE (input, 4, 4, 0x10FFFD);
  }

  /* Incomplete/invalid 1-byte input. */
  {
    static const uint8_t input[] = { 0xC1 };
    CHECK_DECODE (input, 1, -1, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xC3 };
    CHECK_DECODE (input, 1, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xE2 };
    CHECK_DECODE (input, 1, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF4 };
    CHECK_DECODE (input, 1, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xFE };
    CHECK_DECODE (input, 1, -1, 0xFFFD);
  }

  /* Incomplete/invalid 2-byte input. */
  {
    static const uint8_t input[] = { 0xE0, 0x9F };
    CHECK_DECODE (input, 2, -1, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xE2, 0x82 };
    CHECK_DECODE (input, 2, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xE2, 0xD0 };
    CHECK_DECODE (input, 2, -1, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF0, 0x8F };
    CHECK_DECODE (input, 2, -1, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF3, 0x8F };
    CHECK_DECODE (input, 2, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF3, 0xD0 };
    CHECK_DECODE (input, 2, -1, 0xFFFD);
  }

  /* Incomplete/invalid 3-byte input. */
  {
    static const uint8_t input[] = { 0xF3, 0x8F, 0xBF };
    CHECK_DECODE (input, 3, -2, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF3, 0xD0, 0xBF };
    CHECK_DECODE (input, 3, -1, 0xFFFD);
  }
  {
    static const uint8_t input[] = { 0xF3, 0x8F, 0xD0 };
    CHECK_DECODE (input, 3, -1, 0xFFFD);
  }

  return 0;
}

#undef CHECK_DECODE
/* utf8conv_carefully copies UTF-8 to UTF-8 through an iconv-shaped
   interface (*INBUF, *INBYTESLEFT, *OUTBUF, *OUTBYTESLEFT are advanced the
   way iconv advances them), with these differences:
     - it stops at the first conversion error, returns (size_t)(-1) with
       errno set, and stores in *INCREMENTED whether the input pointer has
       been moved past the error location,
     - if ONE_CHARACTER_ONLY is true, it stops after converting one
       character.
   errno values: EINVAL when u8_mbtoucr reports -2, EILSEQ when it reports
   any other failure or when u8_uctomb reports -1, E2BIG when the output
   buffer has no room.  On success the return value is 0 and *INCREMENTED is
   left untouched.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *src = *inbuf;
  size_t src_left = *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t status = 0;

  for (;;)
    {
      ucs4_t uc;
      int consumed = u8_mbtoucr (&uc, (const uint8_t *) src, src_left);
      int produced;

      if (consumed < 0)
        {
          /* Undecodable input: record the reason, then step over the
             offending units so that the caller can resume after them.  */
          errno = (consumed == -2 ? EINVAL : EILSEQ);
          consumed = u8_mbtouc (&uc, (const uint8_t *) src, src_left);
          src += consumed;
          src_left -= consumed;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      /* A full output buffer is handled exactly like u8_uctomb reporting
         "too small" (-2), without calling u8_uctomb at all.  The input
         pointer stays put in both cases.  */
      produced = (dst_left == 0
                  ? -2
                  : u8_uctomb ((uint8_t *) dst, uc, dst_left));
      if (produced == -2)
        {
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }

      src += consumed;
      src_left -= consumed;

      if (produced == -1)
        {
          /* The character was decoded but cannot be stored; the input has
             already been advanced past it.  */
          errno = EILSEQ;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      dst += produced;
      dst_left -= produced;

      if (one_character_only || src_left == 0)
        break;
    }

  *inbuf = src;
  *inbytesleft = src_left;
  *outbuf = dst;
  *outbytesleft = dst_left;
  return status;
}
/* Converts the N source units at S into destination units, one per decoded
   character.

   If RESULTBUF is non-NULL it is used as the initial output buffer and
   *LENGTHP gives its capacity in units; if the output does not fit, a
   malloc-allocated buffer is used instead.  On success, returns the buffer
   holding the result (RESULTBUF or a malloc-allocated block, never NULL) and
   stores the number of units in *LENGTHP.  On failure, returns NULL with
   errno set to EILSEQ (the input could not be decoded) or ENOMEM, and frees
   any buffer this function allocated.  */
DST_UNIT *
FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp)
{
  const SRC_UNIT *const limit = s + n;
  /* Output accumulator.  OUT is always one of: RESULTBUF, NULL, or a
     malloc-allocated block.  Whenever USED > 0, OUT is non-NULL.  */
  DST_UNIT *out = resultbuf;
  size_t capacity = (resultbuf != NULL ? *lengthp : 0);
  size_t used = 0;

  while (s < limit)
    {
      ucs4_t uc;
      int consumed;

      /* Decode the next character.  */
      consumed = u8_mbtoucr (&uc, s, limit - s);
      if (consumed < 0)
        {
          if (out != resultbuf && out != NULL)
            free (out);
          errno = EILSEQ;
          return NULL;
        }
      s += consumed;

      /* Make room for one more unit if necessary.  */
      if (used + 1 > capacity)
        {
          /* Nonzero when OUT is a block obtained from malloc/realloc.  */
          int on_heap = (out != resultbuf && out != NULL);
          DST_UNIT *grown;

          capacity = (capacity > 0 ? 2 * capacity : 12);
          if (capacity < used + 1)
            capacity = used + 1;

          grown = (DST_UNIT *) (on_heap
                                ? realloc (out, capacity * sizeof (DST_UNIT))
                                : malloc (capacity * sizeof (DST_UNIT)));
          if (grown == NULL)
            {
              if (on_heap)
                free (out);
              errno = ENOMEM;
              return NULL;
            }

          /* Units gathered so far in the caller's buffer have to be carried
             over by hand; realloc has already done so for heap blocks.  */
          if (out == resultbuf && used > 0)
            memcpy ((char *) grown, (char *) out, used * sizeof (DST_UNIT));
          out = grown;
        }

      out[used++] = uc;
    }

  if (used == 0)
    {
      if (out == NULL)
        {
          /* NULL is reserved for errors, so hand back a dummy block.  */
          out = (DST_UNIT *) malloc (1);
          if (out == NULL)
            {
              errno = ENOMEM;
              return NULL;
            }
        }
    }
  else if (out != resultbuf && used < capacity)
    {
      /* Give back unused space; keep the larger block if that fails.  */
      DST_UNIT *shrunk = (DST_UNIT *) realloc (out, used * sizeof (DST_UNIT));

      if (shrunk != NULL)
        out = shrunk;
    }

  *lengthp = used;
  return out;
}