void test_unichar(void) { static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1"; static const char collate_in[] = "\xc3\xbc \xc2\xb3"; static const char collate_exp[] = "U\xcc\x88 3"; buffer_t *collate_out; unichar_t chr, chr2; string_t *str = t_str_new(16); test_begin("unichars"); for (chr = 0; chr <= 0x10ffff; chr++) { str_truncate(str, 0); uni_ucs4_to_utf8_c(chr, str); test_assert(uni_utf8_str_is_valid(str_c(str))); test_assert(uni_utf8_get_char(str_c(str), &chr2) > 0); test_assert(chr2 == chr); } collate_out = buffer_create_dynamic(default_pool, 32); uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in), collate_out); test_assert(!strcmp(collate_out->data, collate_exp)); buffer_free(&collate_out); test_assert(!uni_utf8_str_is_valid(overlong_utf8)); test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0); test_end(); test_unichar_uni_utf8_strlen(); test_unichar_uni_utf8_partial_strlen_n(); }
void test_unichar(void) { static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1"; static const char collate_in[] = "\xc3\xbc \xc2\xb3"; static const char collate_exp[] = "U\xcc\x88 3"; buffer_t *collate_out; unichar_t chr, chr2; string_t *str = t_str_new(16); test_begin("unichars encode/decode"); for (chr = 0; chr <= 0x10ffff; chr++) { /* The bottom 6 bits should be irrelevant to code coverage, only test 000000, 111111, and something in between. */ if ((chr & 63) == 1) chr += rand() % 62; /* After 0, somewhere between 1 and 62 */ else if ((chr & 63) > 0 && (chr & 63) < 63) chr |= 63; /* After random, straight to 63 */ str_truncate(str, 0); uni_ucs4_to_utf8_c(chr, str); test_assert(uni_utf8_str_is_valid(str_c(str))); test_assert(uni_utf8_get_char(str_c(str), &chr2) == (int)uni_utf8_char_bytes(*str_data(str))); test_assert(chr2 == chr); if ((chr & 0x63) == 0) { unsigned int utf8len = uni_utf8_char_bytes(*str_c(str)); /* virtually truncate the byte string */ while (--utf8len > 0) test_assert(uni_utf8_get_char_n(str_c(str), utf8len, &chr2) == 0); utf8len = uni_utf8_char_bytes(*str_c(str)); /* actually truncate the byte stream */ while (--utf8len > 0) { str_truncate(str, utf8len); test_assert(!uni_utf8_str_is_valid(str_c(str))); test_assert(uni_utf8_get_char(str_c(str), &chr2) == 0); } } } test_end(); test_begin("unichar collation"); collate_out = buffer_create_dynamic(default_pool, 32); uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in), collate_out); test_assert(!strcmp(collate_out->data, collate_exp)); buffer_free(&collate_out); test_assert(!uni_utf8_str_is_valid(overlong_utf8)); test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0); test_end(); test_unichar_uni_utf8_strlen(); test_unichar_uni_utf8_partial_strlen_n(); }
/*
 * Append a sanitized copy of src to dest, consuming at most max_len bytes
 * of src.
 *
 * - Bytes below 32 are replaced with '?'.
 * - A byte that does not start a decodable UTF-8 character is replaced with
 *   '?' and decoding resumes at the next byte.
 * - If uni_utf8_get_char() reports that the input ended in the middle of a
 *   character (result 0), a single '?' is appended and the function returns.
 * - Valid characters are copied with all of their bytes.
 *
 * A character that would extend past max_len is not consumed. If src is not
 * fully consumed, the appended text is cut back on a character boundary so
 * that at most max_len-3 appended bytes remain (none when max_len < 3) and
 * "..." is added. Content that was in dest before the call is never
 * removed.
 */
void str_sanitize_append(string_t *dest, const char *src, size_t max_len)
{
	size_t initial_pos = str_len(dest);
	/* dest length after the last whole character that still leaves room
	   for the "..." suffix within max_len */
	size_t ellipsis_pos = initial_pos;
	unsigned int len, j;
	unichar_t chr;
	size_t i;
	int ret;

	for (i = 0; src[i] != '\0'; ) {
		len = uni_utf8_char_bytes(src[i]);
		if (i + len > max_len) {
			/* the next character would cross the limit */
			break;
		}

		ret = uni_utf8_get_char(src+i, &chr);
		if (ret == 0) {
			/* input ended too early */
			str_append_c(dest, '?');
			return;
		}
		if (ret < 0) {
			/* invalid UTF-8: replace this byte only */
			str_append_c(dest, '?');
			i++;
		} else if ((unsigned char)src[i] < 32) {
			str_append_c(dest, '?');
			i += len;
		} else {
			/* copy every byte of the character, not just the
			   first one; the decoder has accepted all len bytes */
			for (j = 0; j < len; j++)
				str_append_c(dest, src[i + j]);
			i += len;
		}

		if (max_len >= 3 && str_len(dest) - initial_pos <= max_len - 3)
			ellipsis_pos = str_len(dest);
	}

	if (src[i] != '\0') {
		/* ellipsis_pos is always the end of a whole appended
		   character and never below initial_pos */
		str_truncate(dest, ellipsis_pos);
		str_append(dest, "...");
	}
}
/*
 * Decode the UTF-8 character that the byte at str[*end_pos] belongs to.
 *
 * *end_pos is moved backwards until it points at a byte for which
 * UTF8_IS_START_SEQ() is true; the character starting there is decoded and
 * returned. On return *end_pos is the offset of that start byte.
 *
 * Asserts that a start byte is found before running off the beginning of
 * str, and treats a decode failure as unreachable.
 */
static unichar_t get_ending_utf8_char(const char *str, size_t *end_pos)
{
	unichar_t chr;
	int ret;

	/* step back over bytes that do not start a sequence */
	for (; !UTF8_IS_START_SEQ(str[*end_pos]); (*end_pos)--) {
		i_assert(*end_pos > 0);
	}

	ret = uni_utf8_get_char(&str[*end_pos], &chr);
	if (ret <= 0)
		i_unreached();
	return chr;
}
/*
 * Return the length in bytes of the leading part of src that needs no
 * sanitizing, looking at no more than max_len bytes.
 *
 * Scanning stops at the first character that
 *  - would extend past max_len,
 *  - uni_utf8_get_char() does not decode (result <= 0), or
 *  - starts with a byte below 32 (this includes the terminating NUL).
 *
 * The returned offset is always <= max_len.
 */
static size_t str_sanitize_skip_start(const char *src, size_t max_len)
{
	unsigned int len;
	unichar_t chr;
	size_t i;

	for (i = 0; i < max_len; ) {
		len = uni_utf8_char_bytes(src[i]);
		/* i < max_len here, so the subtraction cannot wrap. Without
		   this check a multi-byte character straddling the limit
		   would push the result past max_len. */
		if (len > max_len - i)
			break;
		if (uni_utf8_get_char(src+i, &chr) <= 0)
			break;
		if ((unsigned char)src[i] < 32)
			break;
		i += len;
	}
	return i;
}
/*
 * Return the length in bytes of the leading part of src that needs no
 * sanitizing, never exceeding max_bytes.
 *
 * Scanning stops at the terminating NUL or at the first character that
 * would extend past max_bytes, that uni_utf8_get_char() does not decode
 * (result <= 0), or that starts with a byte below 32.
 */
static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
{
	size_t i = 0;

	while (src[i] != '\0') {
		unsigned int char_bytes = uni_utf8_char_bytes(src[i]);
		unichar_t decoded;

		/* whole character must fit within the limit */
		if (i + char_bytes > max_bytes)
			break;
		if (uni_utf8_get_char(src + i, &decoded) <= 0)
			break;
		if ((unsigned char)src[i] < 32)
			break;
		i += char_bytes;
	}

	i_assert(i <= max_bytes);
	return i;
}
/*
 * Append a sanitized copy of src to dest, consuming at most max_bytes bytes
 * of src.
 *
 * - Bytes below 32 are replaced with '?'.
 * - A byte that does not start a decodable UTF-8 character is replaced with
 *   '?' and decoding resumes at the next byte.
 * - If uni_utf8_get_char() returns 0 (input ended too early), a single '?'
 *   is appended and the function returns immediately.
 * - Valid characters are copied with all of their bytes.
 *
 * A character that would extend past max_bytes is not consumed. If src is
 * not fully consumed, the appended text is shortened via
 * str_sanitize_truncate_char() until at most max_bytes-3 appended bytes
 * remain (all of it is dropped when max_bytes < 3) and "..." is added.
 * Only text appended by this call is shortened.
 */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
	unsigned int initial_pos = str_len(dest);
	unsigned int char_bytes;
	unichar_t decoded;
	size_t pos = 0;
	int ret;

	while (src[pos] != '\0') {
		char_bytes = uni_utf8_char_bytes(src[pos]);
		if (pos + char_bytes > max_bytes)
			break;

		ret = uni_utf8_get_char(src + pos, &decoded);
		if (ret == 0) {
			/* input ended too early */
			str_append_c(dest, '?');
			return;
		}
		if (ret < 0) {
			/* invalid UTF-8: replace this byte only */
			str_append_c(dest, '?');
			pos++;
			continue;
		}

		if ((unsigned char)src[pos] < 32)
			str_append_c(dest, '?');
		else
			str_append_n(dest, src + pos, char_bytes);
		pos += char_bytes;
	}

	if (src[pos] == '\0') {
		/* everything fit */
		return;
	}

	/* make room for the ellipsis without touching earlier content */
	if (max_bytes < 3) {
		str_truncate(dest, initial_pos);
	} else {
		while (str_len(dest) - initial_pos > max_bytes - 3)
			str_sanitize_truncate_char(dest, initial_pos);
	}
	str_append(dest, "...");
}