void test_unichar(void) { static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1"; static const char collate_in[] = "\xc3\xbc \xc2\xb3"; static const char collate_exp[] = "U\xcc\x88 3"; buffer_t *collate_out; unichar_t chr, chr2; string_t *str = t_str_new(16); test_begin("unichars encode/decode"); for (chr = 0; chr <= 0x10ffff; chr++) { /* The bottom 6 bits should be irrelevant to code coverage, only test 000000, 111111, and something in between. */ if ((chr & 63) == 1) chr += rand() % 62; /* After 0, somewhere between 1 and 62 */ else if ((chr & 63) > 0 && (chr & 63) < 63) chr |= 63; /* After random, straight to 63 */ str_truncate(str, 0); uni_ucs4_to_utf8_c(chr, str); test_assert(uni_utf8_str_is_valid(str_c(str))); test_assert(uni_utf8_get_char(str_c(str), &chr2) == (int)uni_utf8_char_bytes(*str_data(str))); test_assert(chr2 == chr); if ((chr & 0x63) == 0) { unsigned int utf8len = uni_utf8_char_bytes(*str_c(str)); /* virtually truncate the byte string */ while (--utf8len > 0) test_assert(uni_utf8_get_char_n(str_c(str), utf8len, &chr2) == 0); utf8len = uni_utf8_char_bytes(*str_c(str)); /* actually truncate the byte stream */ while (--utf8len > 0) { str_truncate(str, utf8len); test_assert(!uni_utf8_str_is_valid(str_c(str))); test_assert(uni_utf8_get_char(str_c(str), &chr2) == 0); } } } test_end(); test_begin("unichar collation"); collate_out = buffer_create_dynamic(default_pool, 32); uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in), collate_out); test_assert(!strcmp(collate_out->data, collate_exp)); buffer_free(&collate_out); test_assert(!uni_utf8_str_is_valid(overlong_utf8)); test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0); test_end(); test_unichar_uni_utf8_strlen(); test_unichar_uni_utf8_partial_strlen_n(); }
/*
 * Append a sanitized copy of src to dest, reading at most max_len bytes
 * of src.
 *
 * - Bytes that do not start a valid UTF-8 sequence are replaced with '?'.
 * - Characters whose first byte is below 32 are replaced with '?'.
 * - Valid characters are copied in full (all of their bytes).
 * - If the input ends in the middle of a sequence, a '?' is appended and
 *   the function returns immediately.
 * - If src is longer than what was consumed, the appended text is cut back
 *   to at most max_len-3 bytes (nothing if max_len < 3) and "..." is added.
 *   Only text appended by this call is ever removed, and the cut is never
 *   made in the middle of a UTF-8 sequence.
 */
void str_sanitize_append(string_t *dest, const char *src, size_t max_len)
{
	const size_t initial_pos = str_len(dest);
	unsigned int len;
	unichar_t chr;
	size_t i;
	int ret;

	for (i = 0; i < max_len && src[i] != '\0'; ) {
		len = uni_utf8_char_bytes(src[i]);
		if (i + len > max_len) {
			/* the whole character doesn't fit within the limit */
			break;
		}
		ret = uni_utf8_get_char(src+i, &chr);
		if (ret <= 0) {
			/* invalid UTF-8 */
			str_append_c(dest, '?');
			if (ret == 0) {
				/* input ended too early */
				return;
			}
			i++;
			continue;
		}
		if ((unsigned char)src[i] < 32)
			str_append_c(dest, '?');
		else {
			/* copy every byte of the character, not just the
			   first one */
			str_append_n(dest, src+i, len);
		}
		i += len;
	}

	if (src[i] != '\0') {
		/* input was truncated: make room for the ellipsis */
		size_t keep = max_len < 3 ? 0 : max_len - 3;
		size_t new_len = initial_pos + keep;

		if (new_len < str_len(dest)) {
			const unsigned char *data = str_data(dest);

			/* back up while the first removed byte is a
			   continuation byte, so no partial character is
			   left behind */
			while (new_len > initial_pos &&
			       (data[new_len] & 0xc0) == 0x80)
				new_len--;
			str_truncate(dest, new_len);
		}
		str_append(dest, "...");
	}
}
/*
 * Append len bytes of data to dest, encoded so that the result is safe to
 * embed in XML text:
 *
 * - '&', '<' and '>' are written as the predefined entities.
 * - Tab, LF and CR are copied as-is; other bytes below 32 become a space.
 * - Bytes >= 0x80 are decoded as UTF-8; the sequence is copied only if it
 *   is complete, valid and accepted by is_valid_xml_char(). Otherwise
 *   utf8_replacement_char is written in its place.
 * - Everything else is copied unchanged.
 */
static void
xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
{
	unichar_t chr;
	unsigned int i;

	for (i = 0; i < len; i++) {
		switch (data[i]) {
		case '&':
			str_append(dest, "&amp;");
			break;
		case '<':
			str_append(dest, "&lt;");
			break;
		case '>':
			str_append(dest, "&gt;");
			break;
		case '\t':
		case '\n':
		case '\r':
			/* exceptions to the following control char check */
			str_append_c(dest, data[i]);
			break;
		default:
			if (data[i] < 32) {
				/* SOLR doesn't like control characters.
				   replace them with spaces. */
				str_append_c(dest, ' ');
			} else if (data[i] >= 0x80) {
				/* make sure the character is valid for XML
				   so we don't get XML parser errors */
				unsigned int char_len =
					uni_utf8_char_bytes(data[i]);

				/* any positive return is treated as success,
				   so this holds whether the decoder reports
				   1 or the number of bytes consumed */
				if (i + char_len <= len &&
				    uni_utf8_get_char_n(data + i, char_len,
							&chr) > 0 &&
				    is_valid_xml_char(chr))
					str_append_n(dest, data + i, char_len);
				else {
					str_append_n(dest, utf8_replacement_char,
						     UTF8_REPLACEMENT_CHAR_LEN);
				}
				/* the loop's i++ skips the final byte */
				i += char_len - 1;
			} else {
				str_append_c(dest, data[i]);
			}
			break;
		}
	}
}
/*
 * Return the length in bytes of the leading part of src that needs no
 * sanitizing, never exceeding max_len.
 *
 * The scan stops at the first character that
 *  - would extend past max_len,
 *  - fails to decode as UTF-8, or
 *  - has a first byte below 32 (this includes the terminating NUL, so the
 *    scan never runs past the end of the string).
 */
static size_t str_sanitize_skip_start(const char *src, size_t max_len)
{
	unsigned int len;
	unichar_t chr;
	size_t i;

	for (i = 0; i < max_len; ) {
		len = uni_utf8_char_bytes(src[i]);
		if (i + len > max_len) {
			/* a multi-byte character straddles the limit; don't
			   count it, otherwise the result would be > max_len */
			break;
		}
		if (uni_utf8_get_char(src+i, &chr) <= 0)
			break;
		if ((unsigned char)src[i] < 32)
			break;
		i += len;
	}
	return i;
}
/*
 * Count how many leading bytes of src can be used without sanitizing.
 *
 * Walks src one UTF-8 character at a time and stops at the NUL terminator
 * or at the first character that would cross max_bytes, fails to decode,
 * or starts with a byte below 32. The result is always <= max_bytes.
 */
static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
{
	size_t i = 0;

	while (src[i] != '\0') {
		const unsigned int char_bytes = uni_utf8_char_bytes(src[i]);
		unichar_t decoded;

		if (i + char_bytes > max_bytes)
			break;
		if (uni_utf8_get_char(src + i, &decoded) <= 0)
			break;
		if ((unsigned char)src[i] < 32)
			break;
		i += char_bytes;
	}
	i_assert(i <= max_bytes);
	return i;
}
/*
 * Shrink *len so that data[0..*len) does not end with a partial UTF-8
 * character.
 *
 * Scans backwards from the last byte for one that UTF8_IS_START_SEQ()
 * accepts (stopping at index 0 regardless), then compares the length
 * uni_utf8_char_bytes() reports for that byte with the number of bytes
 * actually remaining. If they differ, the sequence is asserted to be
 * longer than what remains and *len is cut back to where it begins.
 *
 * An empty token (*len == 0) is left untouched.
 */
void fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
						size_t *len)
{
	size_t pos;
	unsigned int char_bytes;

	if (*len == 0) {
		/* nothing to check; *len-1 below would wrap around and
		   read far outside the buffer */
		return;
	}

	/* the token is truncated - make sure the last character
	   exists entirely in the token */
	for (pos = *len-1; pos > 0; pos--) {
		if (UTF8_IS_START_SEQ(data[pos]))
			break;
	}
	char_bytes = uni_utf8_char_bytes(data[pos]);
	if (char_bytes != *len-pos) {
		i_assert(char_bytes > *len-pos);
		*len = pos;
	}
}
/*
 * Append a sanitized version of src to dest, consuming at most max_bytes
 * bytes of src.
 *
 * Undecodable bytes and characters starting with a byte below 32 are
 * written as '?'; other characters are copied whole. If decoding reports
 * that the input ended mid-sequence, a '?' is written and the function
 * returns. When src has more data than was consumed, the text appended by
 * this call is shortened (everything if max_bytes < 3, otherwise one
 * character at a time until at most max_bytes-3 bytes remain) and "..."
 * is appended.
 */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
	const unsigned int start_len = str_len(dest);
	size_t pos = 0;

	while (src[pos] != '\0') {
		const unsigned int char_bytes = uni_utf8_char_bytes(src[pos]);
		unichar_t decoded;
		int ret;

		if (pos + char_bytes > max_bytes)
			break;

		ret = uni_utf8_get_char(src + pos, &decoded);
		if (ret == 0) {
			/* input ended too early */
			str_append_c(dest, '?');
			return;
		}
		if (ret < 0) {
			/* invalid UTF-8: replace this byte and resync */
			str_append_c(dest, '?');
			pos++;
		} else if ((unsigned char)src[pos] < 32) {
			str_append_c(dest, '?');
			pos += char_bytes;
		} else {
			str_append_n(dest, src + pos, char_bytes);
			pos += char_bytes;
		}
	}

	if (src[pos] == '\0') {
		/* everything fit */
		return;
	}

	if (max_bytes < 3) {
		/* no room for anything besides the ellipsis */
		str_truncate(dest, start_len);
	} else {
		while (str_len(dest) - start_len > max_bytes - 3)
			str_sanitize_truncate_char(dest, start_len);
	}
	str_append(dest, "...");
}
/*
 * Decode one UTF-8 character from the first max_len bytes of _input.
 * max_len must be > 0.
 *
 * Returns:
 *   >0  number of bytes in the decoded character; *chr_r holds its value
 *    0  the input is an incomplete sequence: either max_len is shorter
 *       than the sequence and the available bytes look valid, or a NUL
 *       byte was found where a continuation byte was expected. *chr_r may
 *       have been set to a partial value and should not be used.
 *   -1  the input is invalid: bad first byte, bad continuation byte, or
 *       an overlong encoding
 */
int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
{
	/* smallest value that legitimately needs a sequence of the given
	   length (indexed by length); never modified, hence const */
	static const unichar_t lowest_valid_chr_table[] = {
		0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	const unsigned char *input = _input;
	unichar_t chr, lowest_valid_chr;
	unsigned int i, len;
	int ret;

	i_assert(max_len > 0);

	if (*input < 0x80) {
		*chr_r = *input;
		return 1;
	}

	/* first byte has len highest bits set, followed by zero bit.
	   the rest of the bits are used as the highest bits of the value. */
	chr = *input;
	len = uni_utf8_char_bytes(*input);
	switch (len) {
	case 2:
		chr &= 0x1f;
		break;
	case 3:
		chr &= 0x0f;
		break;
	case 4:
		chr &= 0x07;
		break;
	case 5:
		chr &= 0x03;
		break;
	case 6:
		chr &= 0x01;
		break;
	default:
		/* only 7bit chars should have len==1 */
		i_assert(len == 1);
		return -1;
	}

	if (len <= max_len) {
		lowest_valid_chr = lowest_valid_chr_table[len];
		ret = len;
	} else {
		/* check first if the input is invalid before returning 0 */
		lowest_valid_chr = 0;
		ret = 0;
		len = max_len;
	}

	/* the following bytes must all be 10xxxxxx */
	for (i = 1; i < len; i++) {
		if ((input[i] & 0xc0) != 0x80)
			return input[i] == '\0' ? 0 : -1;

		chr <<= 6;
		chr |= input[i] & 0x3f;
	}
	if (chr < lowest_valid_chr) {
		/* overlong encoding */
		return -1;
	}

	*chr_r = chr;
	return ret;
}
static bool snippet_generate(struct snippet_context *ctx, const unsigned char *data, size_t size) { size_t i, count; if (ctx->html2text != NULL) { buffer_set_used_size(ctx->plain_output, 0); mail_html2text_more(ctx->html2text, data, size, ctx->plain_output); data = ctx->plain_output->data; size = ctx->plain_output->used; } /* message-decoder should feed us only valid and complete UTF-8 input */ for (i = 0; i < size; i += count) { count = 1; switch (ctx->state) { case SNIPPET_STATE_NEWLINE: if (data[i] == '>' && ctx->html2text == NULL) { ctx->state = SNIPPET_STATE_QUOTED; break; } ctx->state = SNIPPET_STATE_NORMAL; /* fallthrough */ case SNIPPET_STATE_NORMAL: if (size-i >= 3 && ((data[i] == 0xEF && data[i+1] == 0xBB && data[i+2] == 0xBF) || (data[i] == 0xBF && data[i+1] == 0xBB && data[i+2] == 0xEF))) { count += 2; /* because we skip +1 next */ break; } if (data[i] == '\r' || data[i] == '\n' || data[i] == '\t' || data[i] == ' ') { /* skip any leading whitespace */ if (str_len(ctx->snippet) > 1) ctx->add_whitespace = TRUE; if (data[i] == '\n') ctx->state = SNIPPET_STATE_NEWLINE; break; } if (ctx->add_whitespace) { str_append_c(ctx->snippet, ' '); ctx->add_whitespace = FALSE; if (ctx->chars_left-- == 0) return FALSE; } if (ctx->chars_left-- == 0) return FALSE; count = uni_utf8_char_bytes(data[i]); i_assert(i + count <= size); str_append_n(ctx->snippet, data + i, count); break; case SNIPPET_STATE_QUOTED: if (data[i] == '\n') ctx->state = SNIPPET_STATE_NEWLINE; break; } } return TRUE; }