Пример #1
0
/* Unit test for the unichar UTF-8 helpers.

   Three parts:
   1. "unichars encode/decode": encodes a sample of code points in
      0..0x10ffff with uni_ucs4_to_utf8_c(), decodes them again and checks
      the round trip; for some of them it also checks that truncated
      encodings are reported as incomplete/invalid.
   2. "unichar collation": runs a fixed sample through
      uni_utf8_to_decomposed_titlecase() and compares with the expected
      bytes, then checks that an overlong sequence is rejected.
   3. Calls the strlen-related sub-tests. */
void test_unichar(void)
{
	/* 5-byte sequence that the asserts at the end expect to be rejected */
	static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
	/* U+00FC, space, U+00B3 encoded as UTF-8 */
	static const char collate_in[] = "\xc3\xbc \xc2\xb3";
	/* 'U' + U+0308 (combining diaeresis), space, '3' */
	static const char collate_exp[] = "U\xcc\x88 3";
	buffer_t *collate_out;
	unichar_t chr, chr2;
	string_t *str = t_str_new(16);

	test_begin("unichars encode/decode");
	for (chr = 0; chr <= 0x10ffff; chr++) {
		/* The bottom 6 bits should be irrelevant to code coverage,
		   only test 000000, 111111, and something in between. */
		if ((chr & 63) == 1)
			chr += rand() % 62; /* After 0, somewhere between 1 and 62 */
		else if ((chr & 63) > 0 && (chr & 63) < 63)
			chr |= 63; /* After random, straight to 63 */

		/* encode chr, then verify it decodes back to the same value and
		   that the decoder's return value equals the sequence length
		   derived from the first byte */
		str_truncate(str, 0);
		uni_ucs4_to_utf8_c(chr, str);
		test_assert(uni_utf8_str_is_valid(str_c(str)));
		test_assert(uni_utf8_get_char(str_c(str), &chr2) == (int)uni_utf8_char_bytes(*str_data(str)));
		test_assert(chr2 == chr);

		/* NOTE(review): the mask here is hexadecimal 0x63 while the
		   sampling code above uses decimal 63 for the low six bits;
		   possibly meant to be 63 - confirm before changing, since it
		   only affects which code points get the truncation checks. */
		if ((chr & 0x63) == 0) {
			unsigned int utf8len = uni_utf8_char_bytes(*str_c(str));

			/* virtually truncate the byte string */
			while (--utf8len > 0)
				test_assert(uni_utf8_get_char_n(str_c(str), utf8len, &chr2) == 0);

			utf8len = uni_utf8_char_bytes(*str_c(str));

			/* actually truncate the byte stream */
			while (--utf8len > 0) {
				str_truncate(str, utf8len);
				test_assert(!uni_utf8_str_is_valid(str_c(str)));
				test_assert(uni_utf8_get_char(str_c(str), &chr2) == 0);
			}
		}
	}
	test_end();

	test_begin("unichar collation");
	collate_out = buffer_create_dynamic(default_pool, 32);
	/* sizeof() counts the terminating NUL as part of the input;
	   presumably this is what leaves the output NUL-terminated for the
	   strcmp() below - confirm against the helper's implementation */
	uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in),
					 collate_out);
	test_assert(!strcmp(collate_out->data, collate_exp));
	buffer_free(&collate_out);

	/* an overlong encoding must fail both validation and decoding */
	test_assert(!uni_utf8_str_is_valid(overlong_utf8));
	test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
	test_end();

	test_unichar_uni_utf8_strlen();
	test_unichar_uni_utf8_partial_strlen_n();
}
Пример #2
0
/* Append a sanitized copy of src to dest.

   src is consumed one UTF-8 character at a time until its terminating NUL
   is reached or at least max_len bytes have been consumed.
   - A byte that does not start a valid UTF-8 sequence is replaced with '?'
     and scanning resumes at the next byte.
   - If src ends in the middle of a sequence, a single '?' is appended and
     the function returns immediately.
   - Control characters (< 32) are replaced with '?'.
   - Every other character is copied in full, including all continuation
     bytes of a multibyte sequence.
   If unread input remains after the loop, dest is shortened by three bytes
   (or emptied if it holds three or fewer) and "..." is appended. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_len)
{
	unsigned int len, j;
	unichar_t chr;
	size_t i;
	int ret;

	for (i = 0; i < max_len && src[i] != '\0'; ) {
		len = uni_utf8_char_bytes(src[i]);
		ret = uni_utf8_get_char(src+i, &chr);
		if (ret <= 0) {
			/* invalid UTF-8 */
			str_append_c(dest, '?');
			if (ret == 0) {
				/* input ended too early */
				return;
			}
			i++;
			continue;
		}
		if ((unsigned char)src[i] < 32)
			str_append_c(dest, '?');
		else {
			/* copy the whole character: i advances by len below,
			   so appending only the first byte would drop the
			   continuation bytes and leave invalid UTF-8 in dest */
			for (j = 0; j < len; j++)
				str_append_c(dest, src[i + j]);
		}
		i += len;
	}

	if (src[i] != '\0') {
		/* input was cut off: make room for and add the ellipsis.
		   NOTE(review): this removes raw bytes, so it can cut into
		   content that was already in dest or split a multibyte
		   character - confirm callers are fine with that. */
		str_truncate(dest, str_len(dest) <= 3 ? 0 : str_len(dest)-3);
		str_append(dest, "...");
	}
}
Пример #3
0
/* Append data[0..len) to dest, escaped for use as XML character data.

   - '&', '<' and '>' are replaced with their entity references.
   - Tab, LF and CR are copied as-is.
   - Any other byte below 32 is replaced with a space.
   - For bytes >= 0x80 the UTF-8 sequence starting there is decoded; it is
     copied only if it fits in the input, decodes successfully and
     is_valid_xml_char() accepts it. Otherwise utf8_replacement_char is
     written instead.
   - Remaining 7-bit bytes are copied as-is. */
static void
xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
{
	unichar_t chr;
	unsigned int i;

	for (i = 0; i < len; i++) {
		switch (data[i]) {
		case '&':
			str_append(dest, "&amp;");
			break;
		case '<':
			str_append(dest, "&lt;");
			break;
		case '>':
			str_append(dest, "&gt;");
			break;
		case '\t':
		case '\n':
		case '\r':
			/* exceptions to the following control char check */
			str_append_c(dest, data[i]);
			break;
		default:
			if (data[i] < 32) {
				/* SOLR doesn't like control characters.
				   replace them with spaces. */
				str_append_c(dest, ' ');
			} else if (data[i] >= 0x80) {
				/* make sure the character is valid for XML
				   so we don't get XML parser errors */
				unsigned int char_len =
					uni_utf8_char_bytes(data[i]);
				/* success is tested with "> 0" rather than
				   "== 1": the decoder signals failure with
				   0 or a negative value, and depending on
				   its version reports success either as 1
				   or as the number of bytes consumed */
				if (i + char_len <= len &&
				    uni_utf8_get_char_n(data + i, char_len, &chr) > 0 &&
				    is_valid_xml_char(chr))
					str_append_n(dest, data + i, char_len);
				else {
					str_append_n(dest, utf8_replacement_char,
						     UTF8_REPLACEMENT_CHAR_LEN);
				}
				/* skip the rest of the sequence (the loop
				   header adds the final +1); this is done
				   for rejected sequences as well */
				i += char_len - 1;
			} else {
				str_append_c(dest, data[i]);
			}
			break;
		}
	}
}
Пример #4
0
/* Return the byte offset in src of the first character that would need
   sanitizing: one that fails to decode as UTF-8 or whose first byte is a
   control character (< 32, which includes the terminating NUL).
   Scanning stops once max_len bytes or more have been covered; because it
   advances by whole characters, the result can exceed max_len when the
   last character straddles the limit. */
static size_t str_sanitize_skip_start(const char *src, size_t max_len)
{
	size_t pos = 0;
	unichar_t decoded;

	while (pos < max_len) {
		unsigned int char_bytes = uni_utf8_char_bytes(src[pos]);

		if (uni_utf8_get_char(src + pos, &decoded) <= 0 ||
		    (unsigned char)src[pos] < 32)
			return pos;
		pos += char_bytes;
	}
	return pos;
}
Пример #5
0
/* Return how many leading bytes of src can be used without sanitizing.
   Walks src one UTF-8 character at a time and stops at the terminating
   NUL, at the first character that would not fit entirely within
   max_bytes, at the first sequence that fails to decode, or at the first
   control character (< 32). The result never exceeds max_bytes. */
static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
{
    size_t pos = 0;
    unichar_t decoded;

    while (src[pos] != '\0') {
        unsigned int char_bytes = uni_utf8_char_bytes(src[pos]);

        /* never step past the byte budget */
        if (pos + char_bytes > max_bytes)
            break;
        if (uni_utf8_get_char(src + pos, &decoded) <= 0)
            break;
        if ((unsigned char)src[pos] < 32)
            break;
        pos += char_bytes;
    }
    i_assert(pos <= max_bytes);
    return pos;
}
Пример #6
0
/* Drop an incomplete UTF-8 character from the end of a truncated token.

   data: token bytes
   len:  in/out token length in bytes; reduced to the offset of the last
         character's first byte when that character's encoded length
         (taken from its first byte) is larger than the number of bytes
         remaining in the token. Left unchanged otherwise, including when
         the token is empty. */
void
fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
					   size_t *len)
{
	size_t pos;
	unsigned int char_bytes;

	if (*len == 0) {
		/* empty token: nothing to trim, and *len-1 below would wrap
		   around and read far outside the buffer */
		return;
	}

	/* the token is truncated - make sure the last character
	   exists entirely in the token */
	for (pos = *len-1; pos > 0; pos--) {
		if (UTF8_IS_START_SEQ(data[pos]))
			break;
	}
	char_bytes = uni_utf8_char_bytes(data[pos]);
	if (char_bytes != *len-pos) {
		/* the tail can only be shorter than the character, never
		   longer */
		i_assert(char_bytes > *len-pos);
		*len = pos;
	}
}
Пример #7
0
/* Append a sanitized copy of src to dest, reading at most max_bytes bytes
   of src.

   Characters are handled one at a time; a character that would not fit
   entirely within max_bytes ends the scan. Undecodable bytes and control
   characters (< 32) become '?', everything else is copied unchanged. If
   src ends in the middle of a multibyte sequence, one '?' is appended and
   the function returns.

   When input is left over after the scan, the newly appended part is cut
   back (all of it if max_bytes < 3, otherwise character by character until
   it is at most max_bytes-3 bytes long) and "..." is appended. Content
   that was in dest before the call is not touched. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
    unsigned int char_bytes, start_len = str_len(dest);
    unichar_t decoded;
    size_t pos = 0;
    int ret;

    while (src[pos] != '\0') {
        char_bytes = uni_utf8_char_bytes(src[pos]);
        if (pos + char_bytes > max_bytes)
            break;

        ret = uni_utf8_get_char(src + pos, &decoded);
        if (ret == 0) {
            /* input ended too early */
            str_append_c(dest, '?');
            return;
        }
        if (ret < 0) {
            /* invalid UTF-8: replace this byte and resync on the next */
            str_append_c(dest, '?');
            pos++;
            continue;
        }

        if ((unsigned char)src[pos] >= 32)
            str_append_n(dest, src + pos, char_bytes);
        else
            str_append_c(dest, '?');
        pos += char_bytes;
    }

    if (src[pos] == '\0') {
        /* everything fit, no ellipsis needed */
        return;
    }

    if (max_bytes < 3) {
        str_truncate(dest, start_len);
    } else {
        while (str_len(dest) - start_len > max_bytes - 3)
            str_sanitize_truncate_char(dest, start_len);
    }
    str_append(dest, "...");
}
Пример #8
0
/* Decode one UTF-8 sequence from the start of _input.

   _input:  bytes to decode
   max_len: number of bytes available in _input; must be > 0
   chr_r:   receives the decoded value when the result is > 0

   Returns the number of bytes in the sequence (1 for a 7-bit character)
   on success, 0 if the sequence is cut short (by max_len or by a NUL
   byte), and -1 if the first byte cannot start a sequence, a following
   byte is not a continuation byte, or the encoding is overlong. */
int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
{
	/* smallest value that legitimately needs a sequence of the given
	   length; anything lower is an overlong encoding */
	static unichar_t lowest_valid_chr_table[] =
		{ 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
	const unsigned char *bytes = _input;
	unichar_t value, min_value = 0;
	unsigned int seq_len, avail, pos;
	int ret = 0;

	i_assert(max_len > 0);

	if (bytes[0] < 0x80) {
		/* plain 7-bit character */
		*chr_r = bytes[0];
		return 1;
	}

	seq_len = uni_utf8_char_bytes(bytes[0]);
	if (seq_len < 2 || seq_len > 6) {
		/* only 7bit chars should have len==1 */
		i_assert(seq_len == 1);
		return -1;
	}

	/* the first byte has seq_len high bits set followed by a zero bit;
	   the remaining low bits are the top bits of the value
	   (mask is 0x1f, 0x0f, 0x07, 0x03, 0x01 for lengths 2..6) */
	value = bytes[0] & (0x7f >> seq_len);

	avail = seq_len;
	if (seq_len <= max_len) {
		min_value = lowest_valid_chr_table[seq_len];
		ret = seq_len;
	} else {
		/* not enough input: still verify the bytes we do have, so
		   invalid input is reported before "incomplete" (ret and
		   min_value keep their zero defaults) */
		avail = max_len;
	}

	/* the following bytes must all be 10xxxxxx */
	for (pos = 1; pos < avail; pos++) {
		unsigned char b = bytes[pos];

		if ((b & 0xc0) != 0x80)
			return b == '\0' ? 0 : -1;
		value = (value << 6) | (b & 0x3f);
	}

	if (value < min_value) {
		/* overlong encoding */
		return -1;
	}

	*chr_r = value;
	return ret;
}
Пример #9
0
/* Feed one chunk of text into the snippet being built in ctx->snippet.

   If ctx->html2text is set, the chunk is first passed through
   mail_html2text_more() into ctx->plain_output and the converted bytes
   are processed instead of the raw input.

   Processing is a small state machine kept in ctx->state:
   - NEWLINE: at the start of a line. A '>' (when not converting HTML)
     switches to QUOTED; anything else is handled as NORMAL.
   - NORMAL: skips the byte triplets EF BB BF (the UTF-8 byte order mark)
     and BF BB EF, collapses runs of whitespace into a single pending
     space, and copies every other character to the snippet.
   - QUOTED: bytes are skipped until the next '\n'.

   Returns FALSE as soon as ctx->chars_left is used up, TRUE if the whole
   chunk was consumed. */
static bool snippet_generate(struct snippet_context *ctx,
			     const unsigned char *data, size_t size)
{
	size_t i, count;

	if (ctx->html2text != NULL) {
		buffer_set_used_size(ctx->plain_output, 0);
		mail_html2text_more(ctx->html2text, data, size,
				    ctx->plain_output);
		data = ctx->plain_output->data;
		size = ctx->plain_output->used;
	}

	/* message-decoder should feed us only valid and complete
	   UTF-8 input */

	/* count is how many bytes the current iteration consumes: 1 unless
	   a case below changes it */
	for (i = 0; i < size; i += count) {
		count = 1;
		switch (ctx->state) {
		case SNIPPET_STATE_NEWLINE:
			if (data[i] == '>' && ctx->html2text == NULL) {
				ctx->state = SNIPPET_STATE_QUOTED;
				break;
			}
			ctx->state = SNIPPET_STATE_NORMAL;
			/* fallthrough */
		case SNIPPET_STATE_NORMAL:
			if (size-i >= 3 &&
			     ((data[i] == 0xEF && data[i+1] == 0xBB && data[i+2] == 0xBF) ||
			      (data[i] == 0xBF && data[i+1] == 0xBB && data[i+2] == 0xEF))) {
				count += 2; /* because we skip +1 next */
				break;
			}
			if (data[i] == '\r' || data[i] == '\n' ||
			    data[i] == '\t' || data[i] == ' ') {
				/* skip any leading whitespace */
				/* a space is only queued once the snippet
				   holds more than one byte */
				if (str_len(ctx->snippet) > 1)
					ctx->add_whitespace = TRUE;
				if (data[i] == '\n')
					ctx->state = SNIPPET_STATE_NEWLINE;
				break;
			}
			if (ctx->add_whitespace) {
				/* emit the queued separator; it uses up one
				   unit of chars_left like any other
				   character */
				str_append_c(ctx->snippet, ' ');
				ctx->add_whitespace = FALSE;
				if (ctx->chars_left-- == 0)
					return FALSE;
			}
			if (ctx->chars_left-- == 0)
				return FALSE;
			/* copy the whole (possibly multibyte) character */
			count = uni_utf8_char_bytes(data[i]);
			i_assert(i + count <= size);
			str_append_n(ctx->snippet, data + i, count);
			break;
		case SNIPPET_STATE_QUOTED:
			/* ignore the rest of the quoted line */
			if (data[i] == '\n')
				ctx->state = SNIPPET_STATE_NEWLINE;
			break;
		}
	}
	return TRUE;
}