/* A tricky optimization, but probably worth it. */ unsigned long scm_i_utf8_string_hash (const char *str, size_t len) { const scm_t_uint8 *end, *ustr = (const scm_t_uint8 *) str; unsigned long ret; /* The length of the string in characters. This name corresponds to Jenkins' original name. */ size_t length; scm_t_uint32 a, b, c, u32; if (len == (size_t) -1) len = strlen (str); end = ustr + len; if (u8_check (ustr, len) != NULL) /* Invalid UTF-8; punt. */ return scm_i_string_hash (scm_from_utf8_stringn (str, len)); length = u8_strnlen (ustr, len); /* Set up the internal state. */ a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47; /* Handle most of the key. */ while (length > 3) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); a += u32; ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); b += u32; ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); c += u32; mix (a, b, c); length -= 3; } /* Handle the last 3 elements's. */ ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); a += u32; if (--length) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); b += u32; if (--length) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); c += u32; } } final (a, b, c);
/* Sum the display widths of the characters in the first N bytes of the
   UTF-8 string S, as reported by uc_width() for the given ENCODING.
   Scanning stops early at the first NUL character.  Characters for which
   uc_width() reports a negative width (control characters) contribute
   nothing to the total.  */
int
u8_width (const uint8_t *s, size_t n, const char *encoding)
{
  const uint8_t *const limit = s + n;
  const uint8_t *cursor;
  int total = 0;

  for (cursor = s; cursor < limit; )
    {
      ucs4_t ch;
      int cols;

      /* Decode one character and step over its bytes.  */
      cursor += u8_mbtouc_unsafe (&ch, cursor, limit - cursor);

      /* A NUL character terminates the string.  */
      if (ch == 0)
        break;

      cols = uc_width (ch, encoding);
      /* Negative widths (control characters) are skipped.  */
      if (cols >= 0)
        total += cols;
    }

  return total;
}
static int mem_cd_iconveh_internal (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, size_t extra_alloc, size_t *offsets, char **resultp, size_t *lengthp) { /* When a conversion error occurs, we cannot start using CD1 and CD2 at this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR. Instead, we have to start afresh from the beginning of SRC. */ /* Use a temporary buffer, so that for small strings, a single malloc() call will be sufficient. */ # define tmpbufsize 4096 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or libiconv's UCS-4-INTERNAL encoding. */ union { unsigned int align; char buf[tmpbufsize]; } tmp; # define tmpbuf tmp.buf char *initial_result; char *result; size_t allocated; size_t length; size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */ if (*resultp != NULL && *lengthp >= sizeof (tmpbuf)) { initial_result = *resultp; allocated = *lengthp; } else { initial_result = tmpbuf; allocated = sizeof (tmpbuf); } result = initial_result; /* Test whether a direct conversion is possible at all. */ if (cd == (iconv_t)(-1)) goto indirectly; if (offsets != NULL) { size_t i; for (i = 0; i < srclen; i++) offsets[i] = (size_t)(-1); last_length = (size_t)(-1); } length = 0; /* First, try a direct conversion, and see whether a conversion error occurs at all. */ { const char *inptr = src; size_t insize = srclen; /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ # if defined _LIBICONV_VERSION \ || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ || defined __sun) /* Set to the initial state. 
*/ iconv (cd, NULL, NULL, NULL, NULL); # endif while (insize > 0) { char *outptr = result + length; size_t outsize = allocated - extra_alloc - length; bool incremented; size_t res; bool grow; if (offsets != NULL) { if (length != last_length) /* ensure that offset[] be increasing */ { offsets[inptr - src] = length; last_length = length; } res = iconv_carefully_1 (cd, &inptr, &insize, &outptr, &outsize, &incremented); } else /* Use iconv_carefully instead of iconv here, because: - If TO_CODESET is UTF-8, we can do the error handling in this loop, no need for a second loop, - With iconv() implementations other than GNU libiconv and GNU libc, if we use iconv() in a big swoop, checking for an E2BIG return, we lose the number of irreversible conversions. */ res = iconv_carefully (cd, &inptr, &insize, &outptr, &outsize, &incremented); length = outptr - result; grow = (length + extra_alloc > allocated / 2); if (res == (size_t)(-1)) { if (errno == E2BIG) grow = true; else if (errno == EINVAL) break; else if (errno == EILSEQ && handler != iconveh_error) { if (cd2 == (iconv_t)(-1)) { /* TO_CODESET is UTF-8. */ /* Error handling can produce up to 1 byte of output. */ if (length + 1 + extra_alloc > allocated) { char *memory; allocated = 2 * allocated; if (length + 1 + extra_alloc > allocated) abort (); if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { if (result != initial_result) free (result); errno = ENOMEM; return -1; } if (result == initial_result) memcpy (memory, initial_result, length); result = memory; grow = false; } /* The input is invalid in FROM_CODESET. Eat up one byte and emit a question mark. 
*/ if (!incremented) { if (insize == 0) abort (); inptr++; insize--; } result[length] = '?'; length++; } else goto indirectly; } else { if (result != initial_result) { int saved_errno = errno; free (result); errno = saved_errno; } return -1; } } if (insize == 0) break; if (grow) { char *memory; allocated = 2 * allocated; if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { if (result != initial_result) free (result); errno = ENOMEM; return -1; } if (result == initial_result) memcpy (memory, initial_result, length); result = memory; } } } /* Now get the conversion state back to the initial state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */ #if defined _LIBICONV_VERSION \ || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ || defined __sun) for (;;) { char *outptr = result + length; size_t outsize = allocated - extra_alloc - length; size_t res; res = iconv (cd, NULL, NULL, &outptr, &outsize); length = outptr - result; if (res == (size_t)(-1)) { if (errno == E2BIG) { char *memory; allocated = 2 * allocated; if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { if (result != initial_result) free (result); errno = ENOMEM; return -1; } if (result == initial_result) memcpy (memory, initial_result, length); result = memory; } else { if (result != initial_result) { int saved_errno = errno; free (result); errno = saved_errno; } return -1; } } else break; } #endif /* The direct conversion succeeded. */ goto done; indirectly: /* The direct conversion failed. Use a conversion through UTF-8. 
*/ if (offsets != NULL) { size_t i; for (i = 0; i < srclen; i++) offsets[i] = (size_t)(-1); last_length = (size_t)(-1); } length = 0; { const bool slowly = (offsets != NULL || handler == iconveh_error); # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ char utf8buf[utf8bufsize + 1]; size_t utf8len = 0; const char *in1ptr = src; size_t in1size = srclen; bool do_final_flush1 = true; bool do_final_flush2 = true; /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ # if defined _LIBICONV_VERSION \ || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ || defined __sun) /* Set to the initial state. */ if (cd1 != (iconv_t)(-1)) iconv (cd1, NULL, NULL, NULL, NULL); if (cd2 != (iconv_t)(-1)) iconv (cd2, NULL, NULL, NULL, NULL); # endif while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2) { char *out1ptr = utf8buf + utf8len; size_t out1size = utf8bufsize - utf8len; bool incremented1; size_t res1; int errno1; /* Conversion step 1: from FROM_CODESET to UTF-8. */ if (in1size > 0) { if (offsets != NULL && length != last_length) /* ensure that offset[] be increasing */ { offsets[in1ptr - src] = length; last_length = length; } if (cd1 != (iconv_t)(-1)) { if (slowly) res1 = iconv_carefully_1 (cd1, &in1ptr, &in1size, &out1ptr, &out1size, &incremented1); else res1 = iconv_carefully (cd1, &in1ptr, &in1size, &out1ptr, &out1size, &incremented1); } else { /* FROM_CODESET is UTF-8. */ res1 = utf8conv_carefully (slowly, &in1ptr, &in1size, &out1ptr, &out1size, &incremented1); } } else if (do_final_flush1) { /* Now get the conversion state of CD1 back to the initial state. But avoid glibc-2.1 bug and Solaris 2.7 bug. 
*/ # if defined _LIBICONV_VERSION \ || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ || defined __sun) if (cd1 != (iconv_t)(-1)) res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size); else # endif res1 = 0; do_final_flush1 = false; incremented1 = true; } else { res1 = 0; incremented1 = true; } if (res1 == (size_t)(-1) && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ)) { if (result != initial_result) { int saved_errno = errno; free (result); errno = saved_errno; } return -1; } if (res1 == (size_t)(-1) && errno == EILSEQ && handler != iconveh_error) { /* The input is invalid in FROM_CODESET. Eat up one byte and emit a question mark. Room for the question mark was allocated at the end of utf8buf. */ if (!incremented1) { if (in1size == 0) abort (); in1ptr++; in1size--; } *out1ptr++ = '?'; res1 = 0; } errno1 = errno; utf8len = out1ptr - utf8buf; if (offsets != NULL || in1size == 0 || utf8len > utf8bufsize / 2 || (res1 == (size_t)(-1) && errno1 == E2BIG)) { /* Conversion step 2: from UTF-8 to TO_CODESET. */ const char *in2ptr = utf8buf; size_t in2size = utf8len; while (in2size > 0 || (in1size == 0 && !do_final_flush1 && do_final_flush2)) { char *out2ptr = result + length; size_t out2size = allocated - extra_alloc - length; bool incremented2; size_t res2; bool grow; if (in2size > 0) { if (cd2 != (iconv_t)(-1)) res2 = iconv_carefully (cd2, &in2ptr, &in2size, &out2ptr, &out2size, &incremented2); else /* TO_CODESET is UTF-8. */ res2 = utf8conv_carefully (false, &in2ptr, &in2size, &out2ptr, &out2size, &incremented2); } else /* in1size == 0 && !do_final_flush1 && in2size == 0 && do_final_flush2 */ { /* Now get the conversion state of CD1 back to the initial state. But avoid glibc-2.1 bug and Solaris 2.7 bug. 
*/ # if defined _LIBICONV_VERSION \ || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ || defined __sun) if (cd2 != (iconv_t)(-1)) res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size); else # endif res2 = 0; do_final_flush2 = false; incremented2 = true; } length = out2ptr - result; grow = (length + extra_alloc > allocated / 2); if (res2 == (size_t)(-1)) { if (errno == E2BIG) grow = true; else if (errno == EINVAL) break; else if (errno == EILSEQ && handler != iconveh_error) { /* Error handling can produce up to 10 bytes of ASCII output. But TO_CODESET may be UCS-2, UTF-16 or UCS-4, so use CD2 here as well. */ char scratchbuf[10]; size_t scratchlen; ucs4_t uc; const char *inptr; size_t insize; size_t res; if (incremented2) { if (u8_prev (&uc, (const uint8_t *) in2ptr, (const uint8_t *) utf8buf) == NULL) abort (); } else { int n; if (in2size == 0) abort (); n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr, in2size); in2ptr += n; in2size -= n; } if (handler == iconveh_escape_sequence) { static char hex[16] = "0123456789ABCDEF"; scratchlen = 0; scratchbuf[scratchlen++] = '\\'; if (uc < 0x10000) scratchbuf[scratchlen++] = 'u'; else { scratchbuf[scratchlen++] = 'U'; scratchbuf[scratchlen++] = hex[(uc>>28) & 15]; scratchbuf[scratchlen++] = hex[(uc>>24) & 15]; scratchbuf[scratchlen++] = hex[(uc>>20) & 15]; scratchbuf[scratchlen++] = hex[(uc>>16) & 15]; } scratchbuf[scratchlen++] = hex[(uc>>12) & 15]; scratchbuf[scratchlen++] = hex[(uc>>8) & 15]; scratchbuf[scratchlen++] = hex[(uc>>4) & 15]; scratchbuf[scratchlen++] = hex[uc & 15]; } else { scratchbuf[0] = '?'; scratchlen = 1; } inptr = scratchbuf; insize = scratchlen; if (cd2 != (iconv_t)(-1)) res = iconv (cd2, (ICONV_CONST char **) &inptr, &insize, &out2ptr, &out2size); else { /* TO_CODESET is UTF-8. 
*/ if (out2size >= insize) { memcpy (out2ptr, inptr, insize); out2ptr += insize; out2size -= insize; inptr += insize; insize = 0; res = 0; } else { errno = E2BIG; res = (size_t)(-1); } } length = out2ptr - result; if (res == (size_t)(-1) && errno == E2BIG) { char *memory; allocated = 2 * allocated; if (length + 1 + extra_alloc > allocated) abort (); if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { if (result != initial_result) free (result); errno = ENOMEM; return -1; } if (result == initial_result) memcpy (memory, initial_result, length); result = memory; grow = false; out2ptr = result + length; out2size = allocated - extra_alloc - length; if (cd2 != (iconv_t)(-1)) res = iconv (cd2, (ICONV_CONST char **) &inptr, &insize, &out2ptr, &out2size); else { /* TO_CODESET is UTF-8. */ if (!(out2size >= insize)) abort (); memcpy (out2ptr, inptr, insize); out2ptr += insize; out2size -= insize; inptr += insize; insize = 0; res = 0; } length = out2ptr - result; } # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__) /* Irix iconv() inserts a NUL byte if it cannot convert. NetBSD iconv() inserts a question mark if it cannot convert. Only GNU libiconv and GNU libc are known to prefer to fail rather than doing a lossy conversion. */ if (res != (size_t)(-1) && res > 0) { errno = EILSEQ; res = (size_t)(-1); } # endif if (res == (size_t)(-1)) { /* Failure converting the ASCII replacement. 
*/ if (result != initial_result) { int saved_errno = errno; free (result); errno = saved_errno; } return -1; } } else { if (result != initial_result) { int saved_errno = errno; free (result); errno = saved_errno; } return -1; } } if (!(in2size > 0 || (in1size == 0 && !do_final_flush1 && do_final_flush2))) break; if (grow) { char *memory; allocated = 2 * allocated; if (result == initial_result) memory = (char *) malloc (allocated); else memory = (char *) realloc (result, allocated); if (memory == NULL) { if (result != initial_result) free (result); errno = ENOMEM; return -1; } if (result == initial_result) memcpy (memory, initial_result, length); result = memory; } } /* Move the remaining bytes to the beginning of utf8buf. */ if (in2size > 0) memmove (utf8buf, in2ptr, in2size); utf8len = in2size; }
void u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p) { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint8_t *s_end = s + n; int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ /* Don't break inside multibyte characters. */ memset (p, UC_BREAK_PROHIBITED, n); while (s < s_end) { ucs4_t uc; int count = u8_mbtouc_unsafe (&uc, s, s_end - s); int prop = unilbrkprop_lookup (uc); if (prop == LBP_BK) { /* Mandatory break. */ *p = UC_BREAK_MANDATORY; last_prop = LBP_BK; seen_space = NULL; seen_space2 = NULL; } else { char *q; /* Resolve property values whose behaviour is not fixed. */ switch (prop) { case LBP_AI: /* Resolve ambiguous. */ prop = LBP_AI_REPLACEMENT; break; case LBP_CB: /* This is arbitrary. */ prop = LBP_ID; break; case LBP_SA: /* We don't handle complex scripts yet. Treat LBP_SA like LBP_XX. */ case LBP_XX: /* This is arbitrary. */ prop = LBP_AL; break; } /* Deal with spaces and combining characters. */ q = p; if (prop == LBP_SP) { /* Don't break just before a space. */ *p = UC_BREAK_PROHIBITED; seen_space2 = seen_space; seen_space = p; } else if (prop == LBP_ZW) { /* Don't break just before a zero-width space. */ *p = UC_BREAK_PROHIBITED; last_prop = LBP_ZW; seen_space = NULL; seen_space2 = NULL; } else if (prop == LBP_CM) { /* Don't break just before a combining character, except immediately after a zero-width space. */ if (last_prop == LBP_ZW) { /* Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; /* A combining character turns a preceding space into LBP_ID. */ last_prop = LBP_ID; } else { *p = UC_BREAK_PROHIBITED; /* A combining character turns a preceding space into LBP_ID. 
*/ if (seen_space != NULL) { q = seen_space; seen_space = seen_space2; prop = LBP_ID; goto lookup_via_table; } } } else { lookup_via_table: /* prop must be usable as an index for table 7.3 of UTR #14. */ if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) abort (); if (last_prop == LBP_BK) { /* Don't break at the beginning of a line. */ *q = UC_BREAK_PROHIBITED; } else if (last_prop == LBP_ZW) { /* Break after zero-width space. */ *q = UC_BREAK_POSSIBLE; } else { switch (unilbrk_table [last_prop] [prop]) { case D: *q = UC_BREAK_POSSIBLE; break; case I: *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); break; case P: *q = UC_BREAK_PROHIBITED; break; default: abort (); } } last_prop = prop; seen_space = NULL; seen_space2 = NULL; } } s += count; p += count; } }
int u8_width_linebreaks (const uint8_t *s, size_t n, int width, int start_column, int at_end_columns, const char *o, const char *encoding, char *p) { const uint8_t *s_end; char *last_p; int last_column; int piece_width; u8_possible_linebreaks (s, n, encoding, p); s_end = s + n; last_p = NULL; last_column = start_column; piece_width = 0; while (s < s_end) { ucs4_t uc; int count = u8_mbtouc_unsafe (&uc, s, s_end - s); /* Respect the override. */ if (o != NULL && *o != UC_BREAK_UNDEFINED) *p = *o; if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) { /* An atomic piece of text ends here. */ if (last_p != NULL && last_column + piece_width > width) { /* Insert a line break. */ *last_p = UC_BREAK_POSSIBLE; last_column = 0; } } if (*p == UC_BREAK_MANDATORY) { /* uc is a line break character. */ /* Start a new piece at column 0. */ last_p = NULL; last_column = 0; piece_width = 0; } else { /* uc is not a line break character. */ int w; if (*p == UC_BREAK_POSSIBLE) { /* Start a new piece. */ last_p = p; last_column += piece_width; piece_width = 0; /* No line break for the moment, may be turned into UC_BREAK_POSSIBLE later, via last_p. */ } *p = UC_BREAK_PROHIBITED; w = uc_width (uc, encoding); if (w >= 0) /* ignore control characters in the string */ piece_width += w; } s += count; p += count; if (o != NULL) o += count; } /* The last atomic piece of text ends here. */ if (last_p != NULL && last_column + piece_width + at_end_columns > width) { /* Insert a line break. */ *last_p = UC_BREAK_POSSIBLE; last_column = 0; } return last_column + piece_width; }