/* Scan forward from S (bounded by END) and return the position of the next
   boundary reported by uc_is_grapheme_break between two consecutive
   characters.  Returns NULL when S == END, and END when no boundary is
   reported before the end of the range.  */
const uint8_t *
u8_grapheme_next (const uint8_t *s, const uint8_t *end)
{
  ucs4_t before;
  ucs4_t after;
  int width;

  if (s == end)
    return NULL;

  /* Consume the first character; it always belongs to the cluster.  */
  s += u8_mbtouc (&before, s, end - s);

  while (s != end)
    {
      width = u8_mbtouc (&after, s, end - s);
      if (uc_is_grapheme_break (before, after))
        /* S still points at AFTER, i.e. at the start of the next cluster.  */
        return s;
      before = after;
      s += width;
    }

  return s;
}
/* For the N bytes starting at S, fill P[0..N-1]: the entry for the first
   byte of each character receives the result of uc_is_grapheme_break
   (previous character, this character), where the "previous character" of
   the first one is 0; the entries for the remaining bytes of a multibyte
   character are set to 0.  */
void
u8_grapheme_breaks (const uint8_t *s, size_t n, char *p)
{
  ucs4_t last = 0;

  while (n > 0)
    {
      ucs4_t cur;
      int width;
      int k;
      char *out;

      width = u8_mbtouc (&cur, s, n);

      out = p;
      *out++ = uc_is_grapheme_break (last, cur);
      /* Continuation bytes never start a cluster.  */
      for (k = width - 1; k > 0; k--)
        *out++ = 0;

      last = cur;
      s += width;
      p += width;
      n -= width;
    }
}
/* Count the characters in the N bytes starting at S.
   Each call to u8_mbtoucr accounts for one character.  A result of -2
   (presumably "incomplete character at end of input" -- confirm against
   libunistring) counts as one final character and ends the scan; any other
   negative result is re-measured with u8_mbtouc; a result of 0 advances by
   one byte.  */
size_t
u8_mbsnlen (const uint8_t *s, size_t n)
{
  size_t total = 0;

  while (n > 0)
    {
      ucs4_t uc;
      int step = u8_mbtoucr (&uc, s, n);

      if (step == -2)
        /* The trailing piece is still counted once.  */
        return total + 1;

      if (step == 0)
        step = 1;
      else if (step < 0)
        step = u8_mbtouc (&uc, s, n);

      total++;
      s += step;
      n -= step;
    }

  return total;
}
/* Write the LEN bytes at DATA, taken as UTF-8 text, to STREAM's destination
   with HTML escaping applied.

   Bytes left over from a previous call (stream->buf[0..stream->buflen-1])
   are prepended to DATA, and a trailing incomplete UTF-8 sequence is stored
   back into stream->buf for the next call.  Input is processed in chunks of
   at most BUFFERSIZE bytes.

   Per character:
     - '\n' is written as "<br/>", after calling emit_pending_spans with the
       current class stack size temporarily forced to 0;
     - '"', '&', '<', '>' and ' ' are written as the character references
       &quot; &amp; &lt; &gt; &nbsp;
     - other characters in the range 0x20..0x7E are written unchanged;
     - everything else is written as a decimal reference &#nnn;

   Aborts if more than BUFSIZE bytes would have to be carried over.  */
static void
html_ostream::write_mem (html_ostream_t stream, const void *data, size_t len)
{
  if (len > 0)
    {
      #define BUFFERSIZE 2048
      char inbuffer[BUFFERSIZE];
      size_t inbufcount;

      inbufcount = stream->buflen;
      if (inbufcount > 0)
        memcpy (inbuffer, stream->buf, inbufcount);
      for (;;)
        {
          /* At this point, inbuffer[0..inbufcount-1] is filled.  */
          {
            /* Combine the previous rest with a chunk of new input.  */
            size_t n =
              (len <= BUFFERSIZE - inbufcount
               ? len
               : BUFFERSIZE - inbufcount);

            if (n > 0)
              {
                memcpy (inbuffer + inbufcount, data, n);
                data = (const char *) data + n;
                inbufcount += n;
                len -= n;
              }
          }
          {
            /* Handle complete UTF-8 characters.  */
            const char *inptr = inbuffer;
            size_t insize = inbufcount;

            while (insize > 0)
              {
                unsigned char c0;
                ucs4_t uc;
                int nbytes;

                /* Stop if the lead byte announces more bytes than are
                   available; the rest is carried over below.  */
                c0 = ((const unsigned char *) inptr)[0];
                if (insize < (c0 < 0xc0 ? 1 : c0 < 0xe0 ? 2 : c0 < 0xf0 ? 3 :
                              c0 < 0xf8 ? 4 : c0 < 0xfc ? 5 : 6))
                  break;

                nbytes = u8_mbtouc (&uc, (const unsigned char *) inptr, insize);

                if (uc == '\n')
                  {
                    /* Emit the line break with an empty class stack, then
                       restore the stack size.  */
                    size_t prev_class_stack_size = stream->curr_class_stack_size;
                    stream->curr_class_stack_size = 0;
                    emit_pending_spans (stream, false);
                    ostream_write_str (stream->destination, "<br/>");
                    stream->curr_class_stack_size = prev_class_stack_size;
                  }
                else
                  {
                    emit_pending_spans (stream, true);

                    switch (uc)
                      {
                      case '"':
                        ostream_write_str (stream->destination, "&quot;");
                        break;
                      case '&':
                        ostream_write_str (stream->destination, "&amp;");
                        break;
                      case '<':
                        ostream_write_str (stream->destination, "&lt;");
                        break;
                      case '>':
                        /* Needed to avoid "]]>" in the output.  */
                        ostream_write_str (stream->destination, "&gt;");
                        break;
                      case ' ':
                        /* Needed because HTML viewers merge adjacent spaces
                           and drop spaces adjacent to <br> and similar.  */
                        ostream_write_str (stream->destination, "&nbsp;");
                        break;
                      default:
                        if (uc >= 0x20 && uc < 0x7F)
                          {
                            /* Output ASCII characters as such.  */
                            char bytes[1];
                            bytes[0] = uc;
                            ostream_write_mem (stream->destination, bytes, 1);
                          }
                        else
                          {
                            /* Output control and non-ASCII characters in
                               &#nnn; notation.  */
                            char bytes[32];
                            sprintf (bytes, "&#%d;", (int) uc);
                            ostream_write_str (stream->destination, bytes);
                          }
                        break;
                      }
                  }

                inptr += nbytes;
                insize -= nbytes;
              }
            /* Put back the unconverted part.  */
            if (insize > BUFSIZE)
              abort ();
            if (len == 0)
              {
                /* No more input: remember the incomplete tail in the
                   stream for the next call.  */
                if (insize > 0)
                  memcpy (stream->buf, inptr, insize);
                stream->buflen = insize;
                break;
              }
            /* More input follows: move the tail to the front of the
               working buffer and refill behind it.  */
            if (insize > 0)
              memmove (inbuffer, inptr, insize);
            inbufcount = insize;
          }
        }
      #undef BUFFERSIZE
    }
}
static void check_segmentation (const char *input, size_t length, bool print_segments) { size_t offset, line_number, line_offset; struct segmenter s; int prev_type; segmenter_init (&s, mode); line_number = 1; line_offset = 0; prev_type = -1; for (offset = 0; offset < length; ) { enum segment_type type; const char *type_name, *p; int n; if (one_byte) { int n_newlines = 0; int i; for (i = 0; i <= length - offset; i++) { /* Make a copy to ensure that segmenter_push() isn't actually looking ahead. */ char *copy; if (i > 0 && input[offset + i - 1] == '\n') n_newlines++; copy = xmemdup (input + offset, i); n = segmenter_push (&s, copy, i, &type); free (copy); if (n >= 0) break; } assert (n_newlines <= 2); } else n = segmenter_push (&s, input + offset, length - offset, &type); if (n < 0) error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu", offset); assert (offset + n <= length); if (type == SEG_NEWLINE) assert ((n == 1 && input[offset] == '\n') || (n == 2 && input[offset] == '\r' && input[offset + 1] == '\n')); else assert (memchr (&input[offset], '\n', n) == NULL); if (!print_segments) { offset += n; continue; } if (!verbose) { if (prev_type != SEG_SPACES && prev_type != -1 && type == SEG_SPACES && n == 1 && input[offset] == ' ') { printf (" space\n"); offset++; prev_type = -1; continue; } } if (prev_type != -1) putchar ('\n'); prev_type = type; if (verbose) printf ("%2zu:%2zu: ", line_number, offset - line_offset); type_name = segment_type_to_string (type); for (p = type_name; *p != '\0'; p++) putchar (tolower ((unsigned char) *p)); if (n > 0) { int i; for (i = MIN (15, strlen (type_name)); i < 16; i++) putchar (' '); for (i = 0; i < n; ) { const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); ucs4_t uc; int mblen; mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i); if (mblen < 0) { int j; mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i); putchar ('<'); for (j = 0; j < mblen; j++) { if (j > 0) putchar (' '); printf ("%02x", input[offset 
+ i + j]); } putchar ('>'); } else { switch (uc) { case ' ': printf ("_"); break; case '_': printf ("\\_"); break; case '\\': printf ("\\\\"); break; case '\t': printf ("\\t"); break; case '\r': printf ("\\r"); break; case '\n': printf ("\\n"); break; case '\v': printf ("\\v"); break; default: if (uc < 0x20 || uc == 0x00a0) printf ("<U+%04X>", uc); else fwrite (input + offset + i, 1, mblen, stdout); break; } } i += mblen; } } offset += n; if (type == SEG_NEWLINE) { enum prompt_style prompt; line_number++; line_offset = offset; prompt = segmenter_get_prompt (&s); printf (" (%s)\n", prompt_style_to_string (prompt)); } }
/* utf8conv_carefully behaves like iconv, with these differences:
     - source and target encoding are both UTF-8,
     - it stops at the first conversion error and stores in *INCREMENTED
       whether the input pointers were advanced past the error location,
     - when ONE_CHARACTER_ONLY is true, it stops after one character.
   On failure it returns (size_t)(-1) with errno set to EINVAL or EILSEQ
   (input not decodable), E2BIG (no room in the output) or EILSEQ (the
   character could not be encoded).  On success it returns 0 and leaves
   *INCREMENTED untouched.  The in/out pointers and counts are always
   updated before returning.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *src = *inbuf;
  size_t src_left = *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t result = 0;

  for (;;)
    {
      ucs4_t uc;
      int consumed;
      int produced;

      consumed = u8_mbtoucr (&uc, (const uint8_t *) src, src_left);
      if (consumed < 0)
        {
          /* Undecodable input: report it, then skip over it.  */
          errno = (consumed == -2 ? EINVAL : EILSEQ);
          consumed = u8_mbtouc (&uc, (const uint8_t *) src, src_left);
          src += consumed;
          src_left -= consumed;
          *incremented = true;
          result = (size_t)(-1);
          break;
        }

      /* A full output buffer is treated like u8_uctomb returning -2;
         u8_uctomb is not called in that case.  */
      produced = (dst_left == 0
                  ? -2
                  : u8_uctomb ((uint8_t *) dst, uc, dst_left));
      if (produced == -2)
        {
          errno = E2BIG;
          *incremented = false;
          result = (size_t)(-1);
          break;
        }

      src += consumed;
      src_left -= consumed;

      if (produced == -1)
        {
          errno = EILSEQ;
          *incremented = true;
          result = (size_t)(-1);
          break;
        }

      dst += produced;
      dst_left -= produced;

      if (one_character_only || src_left == 0)
        break;
    }

  *inbuf = src;
  *inbytesleft = src_left;
  *outbuf = dst;
  *outbytesleft = dst_left;
  return result;
}
/* Convert a resource name to a class name. Return a nonempty string consisting of alphanumerics and underscores and starting with a letter or underscore. */ static char * construct_class_name (const char *resource_name) { /* This code must be kept consistent with intl.cs, function GettextResourceManager.ConstructClassName. */ /* We could just return an arbitrary fixed class name, like "Messages", assuming that every assembly will only ever contain one GettextResourceSet subclass, but this assumption would break the day we want to support multi-domain PO files in the same format... */ bool valid; const char *p; /* Test for a valid ASCII identifier: - nonempty, - first character is A..Za..z_ - see x-csharp.c:is_identifier_start. - next characters are A..Za..z_0..9 - see x-csharp.c:is_identifier_part. */ valid = (resource_name[0] != '\0'); for (p = resource_name; valid && *p != '\0'; p++) { char c = *p; if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c == '_') || (p > resource_name && c >= '0' && c <= '9'))) valid = false; } if (valid) return xstrdup (resource_name); else { static const char hexdigit[] = "0123456789abcdef"; const char *str = resource_name; const char *str_limit = str + strlen (str); char *class_name = XNMALLOC (12 + 6 * (str_limit - str) + 1, char); char *b; b = class_name; memcpy (b, "__UESCAPED__", 12); b += 12; while (str < str_limit) { ucs4_t uc; str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); if (uc >= 0x10000) { *b++ = '_'; *b++ = 'U'; *b++ = hexdigit[(uc >> 28) & 0x0f]; *b++ = hexdigit[(uc >> 24) & 0x0f]; *b++ = hexdigit[(uc >> 20) & 0x0f]; *b++ = hexdigit[(uc >> 16) & 0x0f]; *b++ = hexdigit[(uc >> 12) & 0x0f]; *b++ = hexdigit[(uc >> 8) & 0x0f]; *b++ = hexdigit[(uc >> 4) & 0x0f]; *b++ = hexdigit[uc & 0x0f]; } else if (!((uc >= 'A' && uc <= 'Z') || (uc >= 'a' && uc <= 'z') || (uc >= '0' && uc <= '9'))) { *b++ = '_'; *b++ = 'u'; *b++ = hexdigit[(uc >> 12) & 0x0f]; *b++ = hexdigit[(uc >> 8) & 0x0f]; *b++ = 
hexdigit[(uc >> 4) & 0x0f]; *b++ = hexdigit[uc & 0x0f]; }
/* Locate the end of the first sentence in STRING, in a way similar to
   'forward-sentence' in Emacs, i.e. roughly matching the regular
   expression:

     [.?!\u2026]
     []"'\u201d)}]*
     \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)

   No Unicode-capable regular expression routine is used; instead a
   hand-written DFA is applied.  Its live states are:
     0  searching for an end marker,
     1  just after an end marker,
     2  after one or more closing quotes/brackets,
     4  after one or more spaces (fewer than the required number).
   The accepting states (3, 5, 6, 7 in the original numbering) are
   represented by returning directly.

   On a match, returns a pointer to the end marker and stores the marker
   character in *ENDING_CHARP.  Otherwise returns a pointer to the
   terminating NUL of STRING and stores 0xfffd in *ENDING_CHARP.  */
const char *
sentence_end (const char *string, ucs4_t *ending_charp)
{
  const char *cursor = string;
  const char *limit = string + strlen (string);
  int state = 0;
  /* The end marker of the current candidate match.  */
  ucs4_t marker = 0xfffd;
  /* Start of the candidate match, and where to resume if it fails.  */
  const char *candidate, *resume_at;
  /* Number of spaces seen after the candidate.  */
  int space_count;

  /* The loop deliberately includes the terminating NUL (cursor == limit),
     which acts as an end-of-line character.  */
  while (cursor <= limit)
    {
      ucs4_t uc;
      size_t width;

      width = u8_mbtouc (&uc, (const unsigned char *) cursor, limit - cursor);

      switch (state)
        {
        case 0:
          if (uc == '.' || uc == '?' || uc == '!' || uc == 0x2026)
            {
              candidate = cursor;
              resume_at = cursor + width;
              marker = uc;
              space_count = 0;
              state = 1;
            }
          cursor += width;
          break;

        case 1:
        case 2:
          if (uc == ']' || uc == '"' || uc == '\'' || uc == ')'
              || uc == '}' || uc == 0x201d)
            {
              state = 2;
              cursor += width;
            }
          else if (uc == '\0' || uc == '\n' || uc == '\t')
            {
              /* End of line (state 3) or tab (state 5).  */
              *ending_charp = marker;
              return candidate;
            }
          else if (uc == ' ' || uc == 0x00a0)
            {
              if (++space_count == sentence_end_required_spaces)
                {
                  /* State 7.  */
                  *ending_charp = marker;
                  return candidate;
                }
              state = 4;
              cursor += width;
            }
          else
            {
              /* Not a sentence end; rescan right after the marker.  */
              cursor = resume_at;
              state = 0;
            }
          break;

        case 4:
          if (uc == '\0' || uc == '\n')
            {
              /* State 6.  */
              *ending_charp = marker;
              return candidate;
            }
          else if (uc == ' ' || uc == 0x00a0)
            {
              if (++space_count == sentence_end_required_spaces)
                {
                  /* State 7.  */
                  *ending_charp = marker;
                  return candidate;
                }
              cursor += width;
            }
          else
            {
              cursor = resume_at;
              state = 0;
            }
          break;
        }
    }

  *ending_charp = 0xfffd;
  return limit;
}
/* Return true if INPUT is an XML reference. */ static bool is_reference (const char *input) { const char *str = input; const char *str_limit = str + strlen (input); ucs4_t uc; int i; str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); assert (uc == '&'); str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); /* CharRef */ if (uc == '#') { str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); if (uc == 'x') { while (str < str_limit) { str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); if (!(('0' <= uc && uc <= '9') || ('A' <= uc && uc <= 'F') || ('a' <= uc && uc <= 'f'))) break; } return uc == ';'; } else if ('0' <= uc && uc <= '9') { while (str < str_limit) { str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); if (!('0' <= uc && uc <= '9')) break; } return uc == ';'; } } else { /* EntityRef */ for (i = 0; i < SIZEOF (name_chars1); i++) if (name_chars1[i].start <= uc && uc <= name_chars1[i].end) break; if (i == SIZEOF (name_chars1)) return false; while (str < str_limit) { str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); for (i = 0; i < SIZEOF (name_chars1); i++) if (name_chars1[i].start <= uc && uc <= name_chars1[i].end) break; if (i == SIZEOF (name_chars1)) { for (i = 0; i < SIZEOF (name_chars2); i++) if (name_chars2[i].start <= uc && uc <= name_chars2[i].end) break; if (i == SIZEOF (name_chars2)) return false; } } return uc == ';'; } return false; }