Пример #1
0
Файл: utf8.c Проект: Gunga/urbit
// Check the UTF-8 character at the start of `str` against the RFC 3629
// restrictions that a plain length check does not cover.
//
// The byte count comes from utf8proc_charlen(); a result <= 0 from that call
// is handed back unchanged.  Otherwise the return value is the byte count
// when the character is accepted, or the negated byte count when it is
// rejected for one of these reasons:
//   - 1 byte:  the byte is NUL (valid UTF-8, but refused for security reasons)
//   - 2 bytes: overlong form (lead byte below 0xC2)
//   - 3 bytes: overlong form (0xE0 followed by a byte below 0xA0), or a
//              surrogate (0xED followed by a byte of 0xA0 or above)
//   - 4 bytes: overlong form (0xF0 followed by a byte below 0x90), or a
//              value above 0x10FFFF (lead above 0xF4, or 0xF4 followed by
//              a byte of 0x90 or above)
static int utf8proc_valid(const uint8_t *str, int str_len)
{
	const int nbytes = utf8proc_charlen(str, str_len);
	int rejected = 0;

	if (nbytes <= 0)
		return nbytes;

	// str[1] is only examined for 3- and 4-byte characters, and only when
	// the lead byte makes the second byte relevant.
	if (nbytes == 1) {
		rejected = (str[0] == 0x00);
	} else if (nbytes == 2) {
		rejected = (str[0] < 0xC2);
	} else if (nbytes == 3) {
		rejected = (str[0] == 0xE0 && str[1] < 0xA0) ||
		           (str[0] == 0xED && str[1] >= 0xA0);
	} else if (nbytes == 4) {
		rejected = (str[0] == 0xF0 && str[1] < 0x90) ||
		           (str[0] > 0xF4) ||
		           (str[0] == 0xF4 && str[1] >= 0x90);
	}

	return rejected ? -nbytes : nbytes;
}
Пример #2
0
// Copy `size` bytes of `line` into `ob`, expanding tabs and replacing
// malformed multi-byte sequences.
//
//   - Runs of ASCII bytes (below 0x80) other than tab are appended verbatim.
//   - Each tab is replaced by 1..4 spaces so that the column counter reaches
//     the next multiple of 4.
//   - Any other byte is measured with utf8proc_charlen(): a positive result
//     means that many bytes are appended as-is; otherwise encode_unknown()
//     is called on `ob` and the offending bytes are skipped.
//
// The column counter advances by one per ASCII byte and by one per
// multi-byte character, whether it was copied or replaced.
void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
{
	static const uint8_t whitespace[] = "    ";

	size_t i = 0, tab = 0;

	while (i < size) {
		size_t org = i;

		// ASCII fast path.  The bound is strictly below 0x80: 0x80 itself
		// is a continuation byte, not ASCII, and must go through the
		// utf8proc_charlen() check below rather than being copied blindly.
		while (i < size && line[i] != '\t' && line[i] < 0x80) {
			i++; tab++;
		}

		if (i > org)
			strbuf_put(ob, line + org, i - org);

		if (i >= size)
			break;

		if (line[i] == '\t') {
			int numspaces = 4 - (tab % 4);
			strbuf_put(ob, whitespace, numspaces);
			i += 1;
			tab += numspaces;
		} else {
			int charlen = utf8proc_charlen(line + i, size - i);

			if (charlen > 0) {
				strbuf_put(ob, line + i, charlen);
			} else {
				encode_unknown(ob);
				// A negative result carries the number of bytes to skip
				// as its magnitude.  A zero result would leave `i`
				// unchanged and stall the loop, so always advance by at
				// least one byte.
				charlen = (charlen < 0) ? -charlen : 1;
			}

			i += charlen;
			tab += 1;
		}
	}
}
Пример #3
0
// Decode the UTF-8 character at the start of `str` into a code point.
//
// `*dst` is set to -1 on entry and is only overwritten on success.
//
// Returns the number of bytes the character occupies (as reported by
// utf8proc_charlen) on success, or -1 when:
//   - utf8proc_charlen() reports a negative length,
//   - the length is not 1, 2, 3 or 4,
//   - the encoding is overlong for its length,
//   - the value is a surrogate (0xD800..0xDFFF),
//   - the value lies in 0xFDD0..0xFDEF,
//   - the value exceeds 0x10FFFF, or
//   - the low 16 bits of the value are 0xFFFE or 0xFFFF.
int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
{
	int32_t cp = -1;
	int nbytes;

	*dst = -1;

	nbytes = utf8proc_charlen(str, str_len);
	if (nbytes < 0)
		return -1;

	// The payload bit fields of each byte never overlap, so they can be
	// merged with bitwise OR.
	if (nbytes == 1) {
		cp = str[0];
	} else if (nbytes == 2) {
		cp = ((str[0] & 0x1F) << 6) | (str[1] & 0x3F);
		if (cp < 0x80)
			cp = -1;
	} else if (nbytes == 3) {
		cp = ((str[0] & 0x0F) << 12) |
		     ((str[1] & 0x3F) << 6) |
		     (str[2] & 0x3F);
		if (cp < 0x800)
			cp = -1;
		else if (cp >= 0xD800 && cp <= 0xDFFF)
			cp = -1;
		else if (cp >= 0xFDD0 && cp <= 0xFDEF)
			cp = -1;
	} else if (nbytes == 4) {
		cp = ((str[0] & 0x07) << 18) |
		     ((str[1] & 0x3F) << 12) |
		     ((str[2] & 0x3F) << 6) |
		     (str[3] & 0x3F);
		if (cp < 0x10000 || cp > 0x10FFFF)
			cp = -1;
	}
	// Any other length leaves cp at -1 and is rejected below.

	if (cp < 0 || (cp & 0xFFFF) >= 0xFFFE)
		return -1;

	*dst = cp;
	return nbytes;
}