Esempio n. 1
0
/* {{{ get_next_char
 *
 * Decode the character that starts at byte offset *newpos of str and return
 * its numeric value.
 *
 * Parameters:
 *   charset   encoding used to interpret str. Charsets without a dedicated
 *             case below are treated as single-byte.
 *   str       input bytes.
 *   str_len   length of str (not referenced directly here; presumably
 *             consumed by CHECK_LEN -- confirm against the macro).
 *   newpos    in: offset at which decoding starts. It is written directly
 *             only on the "no mbseq space" path; on the other paths it is
 *             presumably updated by MB_RETURN and the failure macros.
 *   mbseq     buffer that receives the raw bytes of the decoded character
 *             through MB_WRITE.
 *   mbseqlen  in: room available in mbseq. Set to 0 on the "no mbseq space"
 *             path; otherwise presumably updated by MB_RETURN.
 *   status    set to SUCCESS on entry; presumably changed to a failure value
 *             by CHECK_LEN / MB_FAILURE.
 *
 * Return value:
 *   UTF-8: the decoded Unicode code point. Overlong encodings, surrogates
 *   (U+D800..U+DFFF) and values above U+10FFFF are rejected via MB_FAILURE.
 *   Other multi-byte charsets: the bytes of the sequence packed with the
 *   first byte in the most significant position.
 *   Single-byte charsets: the byte itself.
 *
 * NOTE(review): CHECK_LEN, MB_WRITE, MB_FAILURE and MB_RETURN are macros
 * that are not defined in this block. The code below only makes sense if
 * CHECK_LEN and MB_FAILURE leave the function on error and MB_RETURN returns
 * this_char; verify against their definitions before reordering statements.
 */
inline static unsigned int get_next_char(enum entity_charset charset,
		unsigned char * str,
		int str_len,
		int * newpos,
		unsigned char * mbseq,
		int * mbseqlen,
		int *status)
{
	int pos = *newpos;
	/* mbpos and mbspace are not referenced directly below; presumably they
	 * are the write index and remaining room used by the MB_* macros. */
	int mbpos = 0;
	int mbspace = *mbseqlen;
	unsigned int this_char = 0;
	unsigned char next_char;

	*status = SUCCESS;

	if (mbspace <= 0) {
		/* No room to record a byte sequence: hand back a single raw byte,
		 * whatever the charset is. */
		*mbseqlen = 0;
		CHECK_LEN(pos, 1);
		*newpos = pos + 1;
		return str[pos];
	}

	switch (charset) {
		case cs_utf_8:
			{
				unsigned char c;
				CHECK_LEN(pos, 1);
				c = str[pos];
				if (c < 0x80) {
					/* ASCII */
					MB_WRITE(c);
					this_char = c;
					pos++;
				} else if (c < 0xc0) {
					/* stray continuation byte */
					MB_FAILURE(pos);
				} else if (c < 0xe0) {
					/* two-byte sequence */
					CHECK_LEN(pos, 2);
					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
						MB_FAILURE(pos);
					}
					this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
					if (this_char < 0x80) { /* non-shortest form */
						MB_FAILURE(pos);
					}
					MB_WRITE((unsigned char)c);
					MB_WRITE((unsigned char)str[pos + 1]);
					pos += 2;
				} else if (c < 0xf0) {
					/* three-byte sequence */
					CHECK_LEN(pos, 3);
					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
						MB_FAILURE(pos);
					}
					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
						MB_FAILURE(pos);
					}
					this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
					if (this_char < 0x800) { /* non-shortest form */
						MB_FAILURE(pos);
					} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
						MB_FAILURE(pos);
					}
					MB_WRITE((unsigned char)c);
					MB_WRITE((unsigned char)str[pos + 1]);
					MB_WRITE((unsigned char)str[pos + 2]);
					pos += 3;
				} else if (c < 0xf8) {
					/* four-byte sequence */
					CHECK_LEN(pos, 4);
					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
						MB_FAILURE(pos);
					}
					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
						MB_FAILURE(pos);
					}
					if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
						MB_FAILURE(pos);
					}
					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
					/* Lead bytes 0xf4..0xf7 can encode values past the end of
					 * the Unicode code space; those are not valid UTF-8 and
					 * must be rejected along with non-shortest forms. */
					if (this_char < 0x10000 || this_char > 0x10ffff) {
						MB_FAILURE(pos);
					}
					MB_WRITE((unsigned char)c);
					MB_WRITE((unsigned char)str[pos + 1]);
					MB_WRITE((unsigned char)str[pos + 2]);
					MB_WRITE((unsigned char)str[pos + 3]);
					pos += 4;
				} else {
					/* 0xf8..0xff never start a UTF-8 sequence */
					MB_FAILURE(pos);
				}
			}
			break;
		case cs_big5:
		case cs_gb2312:
		case cs_big5hkscs:
			{
				CHECK_LEN(pos, 1);
				this_char = str[pos++];
				/* check if this is the first of a 2-byte sequence */
				if (this_char >= 0x81 && this_char <= 0xfe) {
					/* peek at the next char */
					CHECK_LEN(pos, 1);
					next_char = str[pos++];
					if ((next_char >= 0x40 && next_char <= 0x7e) ||
							(next_char >= 0xa1 && next_char <= 0xfe)) {
						/* yes, this is a wide char */
						MB_WRITE(this_char);
						MB_WRITE(next_char);
						this_char = (this_char << 8) | next_char;
					} else {
						MB_FAILURE(pos);
					}
				} else {
					MB_WRITE(this_char);
				}
			}
			break;
		case cs_sjis:
			{
				CHECK_LEN(pos, 1);
				this_char = str[pos++];
				/* check if this is the first of a 2-byte sequence */
				if ((this_char >= 0x81 && this_char <= 0x9f) ||
					(this_char >= 0xe0 && this_char <= 0xfc)) {
					/* peek at the next char */
					CHECK_LEN(pos, 1);
					next_char = str[pos++];
					if ((next_char >= 0x40 && next_char <= 0x7e) ||
						(next_char >= 0x80 && next_char <= 0xfc))
					{
						/* yes, this is a wide char */
						MB_WRITE(this_char);
						MB_WRITE(next_char);
						this_char = (this_char << 8) | next_char;
					} else {
						MB_FAILURE(pos);
					}
				} else {
					MB_WRITE(this_char);
				}
				break;
			}
		case cs_eucjp:
			{
				CHECK_LEN(pos, 1);
				this_char = str[pos++];
				/* check if this is the first of a multi-byte sequence */
				if (this_char >= 0xa1 && this_char <= 0xfe) {
					/* peek at the next char */
					CHECK_LEN(pos, 1);
					next_char = str[pos++];
					if (next_char >= 0xa1 && next_char <= 0xfe) {
						/* yes, this is a JIS kanji char */
						MB_WRITE(this_char);
						MB_WRITE(next_char);
						this_char = (this_char << 8) | next_char;
					} else {
						MB_FAILURE(pos);
					}
				} else if (this_char == 0x8e) {
					/* peek at the next char */
					CHECK_LEN(pos, 1);
					next_char = str[pos++];
					if (next_char >= 0xa1 && next_char <= 0xdf) {
						/* JIS X 0201 kana */
						MB_WRITE(this_char);
						MB_WRITE(next_char);
						this_char = (this_char << 8) | next_char;
					} else {
						MB_FAILURE(pos);
					}
				} else if (this_char == 0x8f) {
					/* peek at the next two chars */
					unsigned char next2_char;
					CHECK_LEN(pos, 2);
					next_char = str[pos];
					next2_char = str[pos + 1];
					pos += 2;
					if ((next_char >= 0xa1 && next_char <= 0xfe) &&
						(next2_char >= 0xa1 && next2_char <= 0xfe)) {
						/* JIS X 0212 hojo-kanji */
						MB_WRITE(this_char);
						MB_WRITE(next_char);
						MB_WRITE(next2_char);
						this_char = (this_char << 16) | (next_char << 8) | next2_char;
					} else {
						MB_FAILURE(pos);
					}
				} else {
					MB_WRITE(this_char);
				}
				break;
			}
		default:
			/* single-byte charsets */
			CHECK_LEN(pos, 1);
			this_char = str[pos++];
			MB_WRITE(this_char);
			break;
	}
	MB_RETURN;
}
Esempio n. 2
0
File: html.c Progetto: 20uf/php-src
/* {{{ get_next_char
 *
 * Decode the character that starts at byte offset *cursor of str and return
 * its numeric value.
 *
 * Parameters:
 *   charset  encoding used to interpret str. Charsets without a dedicated
 *            case below are treated as single-byte.
 *   str      input bytes (never written through).
 *   str_len  length of str in bytes; *cursor must not exceed it (asserted).
 *   cursor   in: offset at which decoding starts. out: on success, the offset
 *            just past the decoded character.
 *   status   set to SUCCESS on entry.
 *
 * Return value:
 *   UTF-8: the decoded Unicode code point.
 *   Other multi-byte charsets: the bytes of the sequence packed with the
 *   first byte in the most significant position.
 *   Single-byte charsets: the byte itself.
 *
 * NOTE(review): CHECK_LEN and MB_FAILURE are macros that are not defined in
 * this block. CHECK_LEN(pos, n) is used as a boolean "at least n bytes are
 * available at pos". MB_FAILURE(pos, n) presumably flags failure in *status,
 * moves *cursor n bytes past pos and returns from the function -- the
 * "pos += N" statements that follow failure branches rely on that; confirm
 * against the macro definition. The utf8_*, gb2312_* and sjis_* lead/trail
 * predicates are likewise defined elsewhere.
 */
static inline unsigned int get_next_char(
		enum entity_charset charset,
		const unsigned char *str,
		size_t str_len,
		size_t *cursor,
		int *status)
{
	size_t pos = *cursor;
	unsigned int this_char = 0;

	*status = SUCCESS;
	assert(pos <= str_len);

	/* Nothing left to read: fail before any charset-specific byte access. */
	if (!CHECK_LEN(pos, 1))
		MB_FAILURE(pos, 1);

	switch (charset) {
	case cs_utf_8:
		{
			/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
			 * "In a reported illegal byte sequence, do not include any
			 *  non-initial byte that encodes a valid character or is a leading
			 *  byte for a valid sequence." */
			unsigned char c;
			c = str[pos];
			if (c < 0x80) {
				/* ASCII */
				this_char = c;
				pos++;
			} else if (c < 0xc2) {
				/* continuation byte, or a 0xc0/0xc1 lead that could only
				 * start a non-shortest two-byte form */
				MB_FAILURE(pos, 1);
			} else if (c < 0xe0) {
				/* two-byte sequence */
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				if (!utf8_trail(str[pos + 1])) {
					MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
				}
				this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
				/* Defensive only: c >= 0xc2 here, so the value is always
				 * at least 0x80. */
				if (this_char < 0x80) { /* non-shortest form */
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0xf0) {
				/* three-byte sequence */
				size_t avail = str_len - pos;

				/* The avail tests come first in every condition so that
				 * str[pos + k] is only read when that byte exists. */
				if (avail < 3 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				}

				this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
				if (this_char < 0x800) { /* non-shortest form */
					MB_FAILURE(pos, 3);
				} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
					MB_FAILURE(pos, 3);
				}
				pos += 3;
			} else if (c < 0xf5) {
				/* four-byte sequence (lead bytes 0xf0..0xf4) */
				size_t avail = str_len - pos;

				if (avail < 4 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
						!utf8_trail(str[pos + 3])) {
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else if (avail < 4 || utf8_lead(str[pos + 3]))
						MB_FAILURE(pos, 3);
					else
						MB_FAILURE(pos, 4);
				}

				this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
				if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
					MB_FAILURE(pos, 4);
				}
				pos += 4;
			} else {
				/* 0xf5..0xff never start a sequence accepted here */
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_big5:
		/* reference http://demo.icu-project.org/icu-bin/convexp?conv=big5 */
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else {
					MB_FAILURE(pos, 1);
				}
				pos += 2;
			} else {
				/* any other byte is taken as a one-byte character */
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_big5hkscs:
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else if (next != 0x80 && next != 0xFF) {
					/* second byte is neither 0x80 nor 0xFF: fail with a
					 * length of 1 (lead byte only) */
					MB_FAILURE(pos, 1);
				} else {
					/* second byte is 0x80 or 0xFF: fail with a length of 2 */
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else {
				/* any other byte is taken as a one-byte character */
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_gb2312: /* EUC-CN */
		{
			unsigned char c = str[pos];
			if (c >= 0xA1 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (gb2312_trail(next)) {
					this_char = (c << 8) | next;
				} else if (gb2312_lead(next)) {
					/* bad second byte passes gb2312_lead(): fail with a
					 * length of 1 so it is not part of the failed span */
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (gb2312_lead(c)) {
				/* outside 0xA1..0xFE but accepted by gb2312_lead(): taken
				 * as a one-byte character */
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_sjis:
		{
			unsigned char c = str[pos];
			if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xFC)) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (sjis_trail(next)) {
					this_char = (c << 8) | next;
				} else if (sjis_lead(next)) {
					/* bad second byte passes sjis_lead(): fail with a
					 * length of 1 so it is not part of the failed span */
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0x80 || (c >= 0xA1 && c <= 0xDF)) {
				/* one-byte character */
				this_char = c;
				pos += 1;
			} else {
				/* remaining bytes (0x80, 0xA0, 0xFD..0xFF) are rejected */
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_eucjp:
		{
			unsigned char c = str[pos];

			if (c >= 0xA1 && c <= 0xFE) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);
				next = str[pos + 1];

				if (next >= 0xA1 && next <= 0xFE) {
					/* this is a JIS kanji char */
					this_char = (c << 8) | next;
				} else {
					/* length 2 only when the second byte is 0xA0 or 0xFF,
					 * the two values the one-byte branch below rejects */
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8E) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];
				if (next >= 0xA1 && next <= 0xDF) {
					/* JIS X 0201 kana */
					this_char = (c << 8) | next;
				} else {
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8F) {
				/* three-byte sequence: 0x8F followed by two bytes in
				 * 0xA1..0xFE */
				size_t avail = str_len - pos;

				/* The avail tests come first in every condition so that
				 * str[pos + k] is only read when that byte exists. */
				if (avail < 3 || !(str[pos + 1] >= 0xA1 && str[pos + 1] <= 0xFE) ||
						!(str[pos + 2] >= 0xA1 && str[pos + 2] <= 0xFE)) {
					if (avail < 2 || (str[pos + 1] != 0xA0 && str[pos + 1] != 0xFF))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || (str[pos + 2] != 0xA0 && str[pos + 2] != 0xFF))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				} else {
					/* JIS X 0212 hojo-kanji */
					this_char = (c << 16) | (str[pos + 1] << 8) | str[pos + 2];
				}
				pos += 3;
			} else if (c != 0xA0 && c != 0xFF) {
				/* character encoded in 1 code unit */
				this_char = c;
				pos += 1;
			} else {
				/* 0xA0 and 0xFF are rejected */
				MB_FAILURE(pos, 1);
			}
		}
		break;
	default:
		/* single-byte charsets */
		this_char = str[pos++];
		break;
	}

	/* Success: publish the position just past the decoded character. */
	*cursor = pos;
  	return this_char;
}