Example #1
0
/*
 * Read the next token from f and return its token code.
 *
 * Bytes matched by IS_WHITE and '%' comments are skipped via lex_white()
 * and lex_comment(). Names and keywords are collected by lex_name() into
 * buf; keywords are then mapped with pdf_token_from_keyword(). Strings,
 * hex strings and numbers return whatever lex_string(), lex_hex_string()
 * and lex_number() return.
 *
 * A stray ')' or a single '>' only produces a warning; lexing then
 * resumes with the bytes that follow.
 */
int
pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(f);
			break;
		case '%':
			lex_comment(f);
			break;
		case '/':
			lex_name(f, buf);
			return PDF_TOK_NAME;
		case '(':
			return lex_string(f, buf);
		case ')':
			fz_warn(f->ctx, "lexical error (unexpected ')')");
			continue;
		case '<':
			c = fz_read_byte(f);
			if (c == '<')
			{
				return PDF_TOK_OPEN_DICT;
			}
			else
			{
				/* Not "<<": give the byte back and lex a hex string */
				fz_unread_byte(f);
				return lex_hex_string(f, buf);
			}
		case '>':
			c = fz_read_byte(f);
			if (c == '>')
			{
				return PDF_TOK_CLOSE_DICT;
			}
			fz_warn(f->ctx, "lexical error (unexpected '>')");
			/* The byte read after the stray '>' does not belong to it.
			 * Report end of input directly; otherwise push the byte
			 * back so it is lexed as the start of the next token
			 * instead of being silently dropped. */
			if (c == EOF)
			{
				return PDF_TOK_EOF;
			}
			fz_unread_byte(f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			return lex_number(f, buf, c);
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* First byte of a keyword: let lex_name() re-read it */
			fz_unread_byte(f);
			lex_name(f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
Example #2
0
/*
 * Consume bytes from f for as long as they are <= 32 and accepted by
 * iswhite(). The first byte that fails the test is pushed back, unless
 * it is EOF.
 */
static void
lex_white(fz_stream *f)
{
	int ch;

	for (;;)
	{
		ch = fz_read_byte(f);
		if (ch > 32 || !iswhite(ch))
			break;
	}

	if (ch == EOF)
		return;
	fz_unread_byte(f);
}
Example #3
0
/*
 * Lex a number whose first byte, c, has already been read by the caller.
 *
 * Integers are stored in buf->i and reported as PDF_TOK_INT. As soon as
 * a '.' is seen the result becomes a real, stored in buf->f and reported
 * as PDF_TOK_REAL. The byte that terminates the number is pushed back
 * unless it is EOF.
 */
static int
lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
{
	int negative = 0;
	int whole = 0;
	int frac_num = 0;	/* fractional digits read so far, as an integer */
	int frac_den = 1;	/* 10^(number of digits held in frac_num) */
	float value;

	/* Initially we might have +, -, . or a digit */
	if (c == '.')
		goto fraction;
	if (c == '-')
		negative = 1;
	else if (c != '+')
		whole = c - '0'; /* Must be a digit */

	/* Integer part */
	for (;;)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			/* FIXME: Need overflow check here; do we care? */
			whole = 10*whole + c - '0';
			continue;
		case '.':
			goto fraction;
		case EOF:
			break;
		default:
			fz_unread_byte(f);
			break;
		}
		buf->i = negative ? -whole : whole;
		return PDF_TOK_INT;
	}

	/* In here, we've seen a dot, so can accept just digits */
fraction:
	for (;;)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			/* Stop accumulating before frac_den could overflow */
			if (frac_den >= INT_MAX/10)
				goto skip_digits;
			frac_num = frac_num*10 + (c - '0');
			frac_den *= 10;
			continue;
		case EOF:
			goto finish_real;
		default:
			fz_unread_byte(f);
			goto finish_real;
		}
	}

	/* Ignore any digits after here, because they are too small */
skip_digits:
	for (;;)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			continue;
		case EOF:
			goto finish_real;
		default:
			fz_unread_byte(f);
			goto finish_real;
		}
	}

finish_real:
	value = (float)whole + ((float)frac_num / (float)frac_den);
	if (negative)
		value = -value;
	buf->f = value;
	return PDF_TOK_REAL;
}
Example #4
0
/*
 * Read the next token from f and return its token code.
 *
 * Bytes matched by IS_WHITE and '%' comments are skipped. Names and
 * keywords are collected into buf by lex_name(); strings, hex strings
 * and numbers are delegated to lex_string(), lex_hex_string() and
 * lex_number(). Stray ')' and single '>' bytes are reported with
 * fz_warn() and then skipped.
 */
pdf_token
pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
	for (;;)
	{
		int ch = fz_read_byte(f);

		switch (ch)
		{
		case EOF:
			return PDF_TOK_EOF;

		case IS_WHITE:
			lex_white(f);
			continue;

		case '%':
			lex_comment(f);
			continue;

		case '/':
			lex_name(f, buf);
			return PDF_TOK_NAME;

		case '(':
			return lex_string(f, buf);

		case ')':
			fz_warn(f->ctx, "lexical error (unexpected ')')");
			continue;

		case '<':
			if (fz_read_byte(f) == '<')
				return PDF_TOK_OPEN_DICT;
			/* Not "<<": give the byte back and lex a hex string */
			fz_unread_byte(f);
			return lex_hex_string(f, buf);

		case '>':
			ch = fz_read_byte(f);
			if (ch == '>')
				return PDF_TOK_CLOSE_DICT;
			fz_warn(f->ctx, "lexical error (unexpected '>')");
			if (ch == EOF)
				return PDF_TOK_EOF;
			/* Keep the byte after the stray '>' for the next token */
			fz_unread_byte(f);
			continue;

		case '[':
			return PDF_TOK_OPEN_ARRAY;

		case ']':
			return PDF_TOK_CLOSE_ARRAY;

		case '{':
			return PDF_TOK_OPEN_BRACE;

		case '}':
			return PDF_TOK_CLOSE_BRACE;

		case IS_NUMBER:
		{
			/* cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2231 */
			int tok = lex_number(f, buf, ch);

			/* Swallow, with a warning, any number bytes glued onto
			 * the end of the number that was just read. */
			for (;;)
			{
				int next = fz_peek_byte(f);

				switch (next)
				{
				case IS_NUMBER:
					fz_warn(f->ctx, "ignoring invalid character after number: '%c'", next);
					fz_read_byte(f);
					break;
				default:
					return tok;
				}
			}
		}

		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* First byte of a keyword: let lex_name() re-read it */
			fz_unread_byte(f);
			lex_name(f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
Example #5
0
/*
 * Lex the body of a literal string; the opening '(' has already been
 * consumed by the caller.
 *
 * Decoded bytes are written to lb->scratch, which is enlarged with
 * pdf_lexbuf_grow() whenever it fills up. Nested parentheses are kept in
 * the output and tracked so that only the balancing ')' ends the string.
 * Backslash escapes are decoded; a backslash followed by a line ending
 * produces no output. The resulting length is stored in lb->len and the
 * function always returns PDF_TOK_STRING, including when EOF cuts the
 * string short.
 */
static int
lex_string(fz_stream *f, pdf_lexbuf *lb)
{
	char *out = lb->scratch;
	char *limit = out + lb->size;
	int depth = 1;
	int value;
	int more;
	int c;

	for (;;)
	{
		if (out == limit)
		{
			out += pdf_lexbuf_grow(lb);
			limit = lb->scratch + lb->size;
		}

		c = fz_read_byte(f);
		if (c == EOF)
			goto done;

		if (c == '(')
		{
			depth++;
		}
		else if (c == ')')
		{
			if (--depth == 0)
				goto done;
		}
		else if (c == '\\')
		{
			/* Translate the escape into the byte to store in c */
			c = fz_read_byte(f);
			switch (c)
			{
			case EOF:
				goto done;
			case 'n':
				c = '\n';
				break;
			case 'r':
				c = '\r';
				break;
			case 't':
				c = '\t';
				break;
			case 'b':
				c = '\b';
				break;
			case 'f':
				c = '\f';
				break;
			case RANGE_0_7:
				/* Up to three octal digits in total */
				value = c - '0';
				for (more = 2; more > 0; more--)
				{
					c = fz_read_byte(f);
					if (c < '0' || c > '7')
					{
						if (c != EOF)
							fz_unread_byte(f);
						break;
					}
					value = value * 8 + (c - '0');
				}
				c = value;
				break;
			case '\n':
				/* Escaped newline: line continuation, no output */
				continue;
			case '\r':
				/* Escaped CR or CR LF: line continuation, no output */
				c = fz_read_byte(f);
				if ((c != '\n') && (c != EOF))
					fz_unread_byte(f);
				continue;
			default:
				/* '(', ')', '\\' and anything else: keep the byte */
				break;
			}
		}

		*out++ = c;
	}

done:
	lb->len = out - lb->scratch;
	return PDF_TOK_STRING;
}
Example #6
0
/*
 * Read a name (or keyword) from f into buf->scratch.
 *
 * Reading stops at EOF, at a byte matched by IS_WHITE or IS_DELIM (which
 * is pushed back), at a malformed '#' escape, or when only one byte of
 * room is left in the buffer. "#xx" hex escapes are decoded and "##"
 * yields a literal '#'. The result is NUL-terminated and its length is
 * stored in buf->len.
 */
static void
lex_name(fz_stream *f, pdf_lexbuf *buf)
{
	char *out = buf->scratch;
	int room = buf->size;
	int c;
	int hi;
	int lo;

	/* Every pass that reaches the loop increment has stored one byte */
	for (; room > 1; room--)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			goto done;
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(f);
			goto done;
		case '#':
			break;
		default:
			*out++ = c;
			continue;
		}

		/* First hex digit of a '#' escape */
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			hi = c - '0';
			break;
		case RANGE_a_f:
			hi = c - 'a' + 10;
			break;
		case RANGE_A_F:
			hi = c - 'A' + 10;
			break;
		/* cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2300 */
		case '#':
			/* Emit the first '#' and re-read the second one */
			fz_unread_byte(f);
			*out++ = '#';
			continue;
		case EOF:
			goto done;
		default:
			fz_unread_byte(f);
			goto done;
		}

		/* Second hex digit; if it is missing, keep the high nibble only */
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			lo = c - '0';
			break;
		case RANGE_a_f:
			lo = c - 'a' + 10;
			break;
		case RANGE_A_F:
			lo = c - 'A' + 10;
			break;
		case EOF:
			*out++ = hi << 4;
			goto done;
		default:
			fz_unread_byte(f);
			*out++ = hi << 4;
			goto done;
		}

		*out++ = (hi << 4) + lo;
	}

done:
	*out = '\0';
	buf->len = out - buf->scratch;
}
Example #7
0
/*
 * Lex a number whose first byte, c, has already been read by the caller.
 *
 * Integers are stored in buf->i and reported as PDF_TOK_INT. As soon as
 * a '.' is seen the result becomes a real, stored in buf->f and reported
 * as PDF_TOK_REAL. The byte that terminates the number is pushed back
 * unless it is EOF.
 *
 * Fractional digits are first gathered exactly as the integer ratio n/d.
 * Once d can no longer be multiplied by ten without exceeding INT_MAX,
 * the remaining digits are added directly in floating point.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
	int neg = 0;
	fz_off_t i = 0;
	int n;
	int d;
	float v;
	float fd;

	/* Initially we might have +, -, . or a digit */
	switch (c)
	{
	case '.':
		goto loop_after_dot;
	case '-':
		neg = 1;
		break;
	case '+':
		break;
	default: /* Must be a digit */
		i = c - '0';
		break;
	}

	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case '.':
			goto loop_after_dot;
		case RANGE_0_9:
			/* We deliberately ignore overflow here. We tried
			 * code that returned INT_MIN/MAX as appropriate,
			 * but this causes loss of data (see Bug695950.pdf
			 * for an example). Tests show that Acrobat handles
			 * overflows in exactly the same way we do (i.e.
			 * 123450000000000000000678 is read as 678). */
			i = 10*i + c - '0';
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			if (neg)
				i = -i;
			buf->i = i;
			return PDF_TOK_INT;
		}
	}

	/* In here, we've seen a dot, so can accept just digits */
loop_after_dot:
	n = 0;
	d = 1;
	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			/* n/d cannot take another digit without overflowing d */
			if (d >= INT_MAX/10)
				goto underflow;
			n = n*10 + (c - '0');
			d *= 10;
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			v = (float)i + ((float)n / (float)d);
			if (neg)
				v = -v;
			buf->f = v;
			return PDF_TOK_REAL;
		}
	}

underflow:
	/* fd is the weight of the last digit held in n */
	fd = 1 / (float)d;
	v = (float)i + ((float)n * fd);
	/* c still holds the digit that triggered the jump here. It has been
	 * consumed from the stream but not added to n, so account for it now.
	 * Dropping it would lose that digit and give every later digit ten
	 * times its proper weight. */
	fd /= 10;
	v += (c - '0') * fd;
	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			fd /= 10;
			v += (c - '0') * fd;
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			if (neg)
				v = -v;
			buf->f = v;
			return PDF_TOK_REAL;
		}
	}
}
Example #8
0
/*
 * Read a name (or keyword) from f into buf->scratch.
 *
 * Reading stops at EOF, at a byte matched by IS_WHITE or IS_DELIM (which
 * is pushed back), at a malformed '#' escape, or when only one byte of
 * room is left in the buffer. "#xx" hex escapes are decoded. The result
 * is NUL-terminated and its length is stored in buf->len.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	char *out = buf->scratch;
	int room = buf->size;
	int c;
	int hi;
	int lo;

	/* Every pass that reaches the loop increment has stored one byte */
	for (; room > 1; room--)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			goto done;
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto done;
		case '#':
			break;
		default:
			*out++ = c;
			continue;
		}

		/* First hex digit of a '#' escape */
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			hi = c - '0';
			break;
		case RANGE_a_f:
			hi = c - 'a' + 10;
			break;
		case RANGE_A_F:
			hi = c - 'A' + 10;
			break;
		case EOF:
			goto done;
		default:
			fz_unread_byte(ctx, f);
			goto done;
		}

		/* Second hex digit; if it is missing, keep the high nibble only */
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			lo = c - '0';
			break;
		case RANGE_a_f:
			lo = c - 'a' + 10;
			break;
		case RANGE_A_F:
			lo = c - 'A' + 10;
			break;
		case EOF:
			*out++ = hi << 4;
			goto done;
		default:
			fz_unread_byte(ctx, f);
			*out++ = hi << 4;
			goto done;
		}

		*out++ = (hi << 4) + lo;
	}

done:
	*out = '\0';
	buf->len = out - buf->scratch;
}
Example #9
0
/*
 * Copy the text of a number from f into s (capacity n, including the
 * terminating NUL) and return the number of bytes copied.
 *
 * *tok is set to PDF_TOK_INT, or to PDF_TOK_REAL once a '.' has been
 * copied. A sign is accepted only as the very first byte and at most one
 * '.' is accepted. The first byte that does not fit the number is pushed
 * back unless it is EOF. Copying also stops when only one byte of room
 * is left.
 */
static int
lex_number(fz_stream *f, char *s, int n, int *tok)
{
	char *start = s;
	int started = 0;	/* a sign or digit has been copied */
	int dotted = 0;		/* a '.' has been copied */
	int c;

	*tok = PDF_TOK_INT;

	while (n > 1)
	{
		c = fz_read_byte(f);
		if (c == EOF)
			break;

		switch (c)
		{
		case '+':
		case '-':
			/* Initially we might have +, -, . or a digit; a sign
			 * anywhere else ends the number */
			if (started || dotted)
			{
				fz_unread_byte(f);
				goto done;
			}
			started = 1;
			break;
		case '.':
			/* Once we've seen a dot, we can accept just digits */
			if (dotted)
			{
				fz_unread_byte(f);
				goto done;
			}
			dotted = 1;
			*tok = PDF_TOK_REAL;
			break;
		case RANGE_0_9:
			started = 1;
			break;
		default:
			fz_unread_byte(f);
			goto done;
		}

		*s++ = c;
		n--;
	}

done:
	*s = '\0';
	return s - start;
}
Example #10
0
/*
 * Read the next token from f.
 *
 * The token code is stored in *tok. For names, keywords, strings and
 * numbers the text is written to buf (capacity n) and its length is
 * stored in *sl. Bytes matched by IS_WHITE and '%' comments are skipped.
 *
 * Returns fz_okay on success. A stray ')' or a single '>' sets *tok to
 * PDF_TOK_ERROR and returns the error produced by fz_throw().
 */
fz_error
pdf_lex(int *tok, fz_stream *f, char *buf, int n, int *sl)
{
	int c;

	for (;;)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case IS_WHITE:
			lex_white(f);
			continue;
		case '%':
			lex_comment(f);
			continue;

		case EOF:
			*tok = PDF_TOK_EOF;
			break;
		case '/':
			lex_name(f, buf, n);
			*sl = strlen(buf);
			*tok = PDF_TOK_NAME;
			break;
		case '(':
			*sl = lex_string(f, buf, n);
			*tok = PDF_TOK_STRING;
			break;
		case ')':
			goto cleanuperror;
		case '<':
			if (fz_read_byte(f) == '<')
			{
				*tok = PDF_TOK_OPEN_DICT;
				break;
			}
			/* Not "<<": give the byte back and lex a hex string */
			fz_unread_byte(f);
			*sl = lex_hex_string(f, buf, n);
			*tok = PDF_TOK_STRING;
			break;
		case '>':
			if (fz_read_byte(f) != '>')
				goto cleanuperror;
			*tok = PDF_TOK_CLOSE_DICT;
			break;
		case '[':
			*tok = PDF_TOK_OPEN_ARRAY;
			break;
		case ']':
			*tok = PDF_TOK_CLOSE_ARRAY;
			break;
		case '{':
			*tok = PDF_TOK_OPEN_BRACE;
			break;
		case '}':
			*tok = PDF_TOK_CLOSE_BRACE;
			break;
		case IS_NUMBER:
			/* lex_number() re-reads the first byte itself */
			fz_unread_byte(f);
			*sl = lex_number(f, buf, n, tok);
			break;
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			fz_unread_byte(f);
			lex_name(f, buf, n);
			*sl = strlen(buf);
			*tok = pdf_token_from_keyword(buf);
			break;
		}

		/* Every case that leaves the switch has produced a token */
		return fz_okay;
	}

cleanuperror:
	*tok = PDF_TOK_ERROR;
	return fz_throw("lexical error");
}
Example #11
0
/*
 * Lex the body of a literal string into buf (capacity n); the opening
 * '(' has already been consumed by the caller.
 *
 * Nested parentheses are kept in the output and tracked so that only the
 * balancing ')' ends the string. Backslash escapes are decoded; a
 * backslash followed by a line ending produces no output. Lexing also
 * stops at EOF or when buf is full. The output is not NUL-terminated.
 *
 * Returns the number of bytes written to buf.
 */
static int
lex_string(fz_stream *f, char *buf, int n)
{
	char *s = buf;
	char *e = buf + n;
	int bal = 1;
	int oct;
	int c;

	while (s < e)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			goto end;
		case '(':
			bal++;
			*s++ = c;
			break;
		case ')':
			bal --;
			if (bal == 0)
				goto end;
			*s++ = c;
			break;
		case '\\':
			c = fz_read_byte(f);
			switch (c)
			{
			case EOF:
				goto end;
			case 'n':
				*s++ = '\n';
				break;
			case 'r':
				*s++ = '\r';
				break;
			case 't':
				*s++ = '\t';
				break;
			case 'b':
				*s++ = '\b';
				break;
			case 'f':
				*s++ = '\f';
				break;
			case '(':
				*s++ = '(';
				break;
			case ')':
				*s++ = ')';
				break;
			case '\\':
				*s++ = '\\';
				break;
			/* Octal escape: one to three digits, each in 0..7 only.
			 * '8' and '9' are not octal digits; after a backslash
			 * they take the default branch and are stored as-is. */
			case '0': case '1': case '2': case '3':
			case '4': case '5': case '6': case '7':
				oct = c - '0';
				c = fz_read_byte(f);
				if (c >= '0' && c <= '7')
				{
					oct = oct * 8 + (c - '0');
					c = fz_read_byte(f);
					if (c >= '0' && c <= '7')
						oct = oct * 8 + (c - '0');
					else if (c != EOF)
						fz_unread_byte(f);
				}
				else if (c != EOF)
					fz_unread_byte(f);
				*s++ = oct;
				break;
			case '\n':
				/* Escaped newline: line continuation, no output */
				break;
			case '\r':
				/* Escaped CR or CR LF: line continuation, no output */
				c = fz_read_byte(f);
				if ((c != '\n') && (c != EOF))
					fz_unread_byte(f);
				break;
			default:
				/* Unknown escape: drop the backslash, keep the byte */
				*s++ = c;
			}
			break;
		default:
			*s++ = c;
			break;
		}
	}
end:
	return s - buf;
}
Example #12
0
/*
 * Read a name (or keyword) from f into s (capacity n, including the
 * terminating NUL).
 *
 * Reading stops at EOF, at a byte matched by IS_WHITE or IS_DELIM (which
 * is pushed back), at a malformed '#' escape, or when only one byte of
 * room is left. "#xx" hex escapes are decoded. The result is always
 * NUL-terminated.
 */
static void
lex_name(fz_stream *f, char *s, int n)
{
	int c;
	int hi;
	int lo;

	/* Every pass that reaches the loop increment has stored one byte */
	for (; n > 1; n--)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			goto done;
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(f);
			goto done;
		case '#':
			break;
		default:
			*s++ = c;
			continue;
		}

		/* First hex digit of a '#' escape */
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			hi = c - '0';
			break;
		case RANGE_a_f:
			hi = c - 'a' + 10;
			break;
		case RANGE_A_F:
			hi = c - 'A' + 10;
			break;
		case EOF:
			goto done;
		default:
			fz_unread_byte(f);
			goto done;
		}

		/* Second hex digit; if it is missing, keep the high nibble only */
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			lo = c - '0';
			break;
		case RANGE_a_f:
			lo = c - 'a' + 10;
			break;
		case RANGE_A_F:
			lo = c - 'A' + 10;
			break;
		case EOF:
			*s++ = hi << 4;
			goto done;
		default:
			fz_unread_byte(f);
			*s++ = hi << 4;
			goto done;
		}

		*s++ = (hi << 4) + lo;
	}

done:
	*s = '\0';
}