/* Append a sanitized copy of src to dest.
 *
 * At most max_bytes bytes of src are examined. Bytes that do not decode as
 * UTF-8 and characters whose first byte is below 32 are written as '?';
 * everything else is copied through unchanged.
 *
 * If src still has content after scanning stops, the appended text is cut
 * back and "..." is added: with max_bytes < 3 everything appended by this
 * call is dropped first, otherwise characters are removed with
 * str_sanitize_truncate_char() until at most max_bytes-3 appended bytes
 * remain.
 *
 * src must be NUL-terminated: src[i] is read at the position where scanning
 * stopped, which can be index max_bytes.
 */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
	const unsigned int start_len = str_len(dest);
	unichar_t decoded;
	size_t pos = 0;

	while (pos < max_bytes && src[pos] != '\0') {
		int char_len = uni_utf8_get_char_n(src + pos, max_bytes - pos,
						   &decoded);

		if (char_len == 0) {
			/* the sequence is cut off by the max_bytes limit */
			break;
		}
		if (char_len < 0) {
			/* not valid UTF-8: replace this one byte and resync */
			str_append_c(dest, '?');
			pos++;
			continue;
		}

		if ((unsigned char)src[pos] >= 32)
			str_append_n(dest, src + pos, char_len);
		else
			str_append_c(dest, '?'); /* control character */
		pos += char_len;
	}

	if (src[pos] == '\0') {
		/* the whole input was consumed, no ellipsis needed */
		return;
	}

	if (max_bytes >= 3) {
		/* leave room for the three-byte ellipsis */
		while (str_len(dest) - start_len > max_bytes - 3)
			str_sanitize_truncate_char(dest, start_len);
	} else {
		str_truncate(dest, start_len);
	}
	str_append(dest, "...");
}
/* Unit test for the unichar UTF-8 helpers.
 *
 * Covers three areas:
 *  - encode/decode round-trips for code points 0..0x10ffff (sampled), plus
 *    decoding of truncated sequences;
 *  - decomposed-titlecase conversion of a short sample string;
 *  - rejection of a byte sequence starting with 0xf8.
 * Finally runs the strlen-related sub-tests.
 */
void test_unichar(void)
{
	/* expected to be rejected as invalid by the checks at the end */
	static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
	/* U+00FC (u with diaeresis), space, U+00B3 (superscript three) */
	static const char collate_in[] = "\xc3\xbc \xc2\xb3";
	/* 'U' followed by U+0308 (combining diaeresis), space, '3' */
	static const char collate_exp[] = "U\xcc\x88 3";
	buffer_t *collate_out;
	unichar_t chr, chr2;
	string_t *str = t_str_new(16);

	test_begin("unichars encode/decode");
	for (chr = 0; chr <= 0x10ffff; chr++) {
		/* The bottom 6 bits should be irrelevant to code coverage,
		   only test 000000, 111111, and something in between.
		   Note that chr is advanced inside the body, so each group of
		   64 code points is visited at most three times. */
		if ((chr & 63) == 1)
			chr += rand() % 62; /* After 0, somewhere between 1 and 62 */
		else if ((chr & 63) > 0 && (chr & 63) < 63)
			chr |= 63; /* After random, straight to 63 */

		/* encode chr, then check the bytes validate and decode back to
		   the same code point with the expected byte count */
		str_truncate(str, 0);
		uni_ucs4_to_utf8_c(chr, str);
		test_assert(uni_utf8_str_is_valid(str_c(str)));
		test_assert(uni_utf8_get_char(str_c(str), &chr2) == (int)uni_utf8_char_bytes(*str_data(str)));
		test_assert(chr2 == chr);

		/* NOTE(review): the mask here is hexadecimal 0x63 while the rest
		   of the loop uses decimal 63 - possibly a typo; confirm which
		   code points are meant to get the truncation checks. */
		if ((chr & 0x63) == 0) {
			unsigned int utf8len = uni_utf8_char_bytes(*str_c(str));

			/* virtually truncate the byte string: every shorter length
			   limit must be reported as incomplete input (0) */
			while (--utf8len > 0)
				test_assert(uni_utf8_get_char_n(str_c(str), utf8len, &chr2) == 0);

			utf8len = uni_utf8_char_bytes(*str_c(str));

			/* actually truncate the byte stream: the shortened string
			   must fail validation and decode as incomplete (0) */
			while (--utf8len > 0) {
				str_truncate(str, utf8len);
				test_assert(!uni_utf8_str_is_valid(str_c(str)));
				test_assert(uni_utf8_get_char(str_c(str), &chr2) == 0);
			}
		}
	}
	test_end();

	test_begin("unichar collation");
	collate_out = buffer_create_dynamic(default_pool, 32);
	/* sizeof() includes the terminating NUL; presumably this is what lets
	   the output be compared with strcmp() below - TODO confirm */
	uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in), collate_out);
	test_assert(!strcmp(collate_out->data, collate_exp));
	buffer_free(&collate_out);

	/* the 0xf8-led sequence must fail validation and decode with an error */
	test_assert(!uni_utf8_str_is_valid(overlong_utf8));
	test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
	test_end();

	test_unichar_uni_utf8_strlen();
	test_unichar_uni_utf8_partial_strlen_n();
}
/* Append data[0..len) to dest, escaped so it can be embedded as XML
 * character data.
 *
 *  - '&', '<' and '>' are written as the predefined entities &amp;, &lt;
 *    and &gt;.
 *  - Tab, LF and CR are copied as-is; every other byte below 32 becomes a
 *    space.
 *  - A byte >= 0x80 starts a multi-byte UTF-8 sequence. The sequence is
 *    copied only if it fits inside the input, decodes successfully and
 *    is_valid_xml_char() accepts the code point; otherwise
 *    utf8_replacement_char is written in its place.
 *  - All remaining ASCII bytes are copied unchanged.
 */
static void
xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
{
	unichar_t chr;
	unsigned int i;

	for (i = 0; i < len; i++) {
		switch (data[i]) {
		case '&':
			str_append(dest, "&amp;");
			break;
		case '<':
			str_append(dest, "&lt;");
			break;
		case '>':
			str_append(dest, "&gt;");
			break;
		case '\t':
		case '\n':
		case '\r':
			/* exceptions to the following control char check */
			str_append_c(dest, data[i]);
			break;
		default:
			if (data[i] < 32) {
				/* SOLR doesn't like control characters.
				   replace them with spaces. */
				str_append_c(dest, ' ');
			} else if (data[i] >= 0x80) {
				/* make sure the character is valid for XML
				   so we don't get XML parser errors */
				unsigned int char_len =
					uni_utf8_char_bytes(data[i]);

				/* Any positive return means the sequence was
				   decoded; other callers in this file treat the
				   value as a byte count, so comparing against
				   exactly 1 would reject every multi-byte
				   character. */
				if (i + char_len <= len &&
				    uni_utf8_get_char_n(data + i, char_len, &chr) > 0 &&
				    is_valid_xml_char(chr))
					str_append_n(dest, data + i, char_len);
				else {
					str_append_n(dest, utf8_replacement_char,
						     UTF8_REPLACEMENT_CHAR_LEN);
				}
				/* skip the rest of the sequence; the loop's i++
				   accounts for the final byte */
				i += char_len - 1;
			} else {
				str_append_c(dest, data[i]);
			}
			break;
		}
	}
}
/* Return the length in bytes of the leading part of src that needs no
 * sanitizing.
 *
 * Scanning stops at the first NUL, after max_bytes bytes, at the first
 * sequence that uni_utf8_get_char_n() does not decode (return value <= 0),
 * or at the first character whose leading byte is below 32, whichever comes
 * first. The result never exceeds max_bytes.
 */
static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
{
	unichar_t decoded;
	size_t i = 0;

	while (i < max_bytes && src[i] != '\0') {
		int char_len = uni_utf8_get_char_n(src + i, max_bytes - i,
						   &decoded);

		/* undecodable/incomplete sequence or a control character:
		   sanitizing has to start here */
		if (char_len <= 0 || (unsigned char)src[i] < 32)
			break;
		i += char_len;
	}

	i_assert(i <= max_bytes);
	return i;
}
/* Decode the first UTF-8 character of input into *chr_r.
 *
 * Convenience wrapper around uni_utf8_get_char_n() that passes the largest
 * possible size_t as the length limit, so decoding is not bounded by a
 * caller-supplied length. The return value is that of
 * uni_utf8_get_char_n(), passed through unchanged.
 */
int uni_utf8_get_char(const char *input, unichar_t *chr_r)
{
	const unsigned char *bytes = (const unsigned char *)input;
	const size_t no_limit = (size_t)-1;

	return uni_utf8_get_char_n(bytes, no_limit, chr_r);
}