/*
 * Read the next token from stream f.
 *
 * White space and comments are skipped. buf is handed to the lex_* helper
 * that handles names, strings, numbers and keywords. A stray ')' or '>' is
 * reported through fz_warn and scanning carries on.
 *
 * Returns a PDF_TOK_* value; PDF_TOK_EOF once the stream is exhausted.
 */
int pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(f);
			break;
		case '%':
			lex_comment(f);
			break;
		case '/':
			lex_name(f, buf);
			return PDF_TOK_NAME;
		case '(':
			return lex_string(f, buf);
		case ')':
			fz_warn(f->ctx, "lexical error (unexpected ')')");
			continue;
		case '<':
			c = fz_read_byte(f);
			if (c == '<')
				return PDF_TOK_OPEN_DICT;
			/* Only push back a byte that was really read; there is
			 * nothing to unread after EOF. */
			if (c != EOF)
				fz_unread_byte(f);
			return lex_hex_string(f, buf);
		case '>':
			c = fz_read_byte(f);
			if (c == '>')
				return PDF_TOK_CLOSE_DICT;
			fz_warn(f->ctx, "lexical error (unexpected '>')");
			if (c == EOF)
				return PDF_TOK_EOF;
			/* The byte after the stray '>' belongs to the next token,
			 * so hand it back instead of dropping it. */
			fz_unread_byte(f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			return lex_number(f, buf, c);
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* Re-read the first byte as part of the keyword text. */
			fz_unread_byte(f);
			lex_name(f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Consume bytes for as long as they are white space, then push back the
 * byte that ended the run (unless the read returned EOF).
 */
static void lex_white(fz_stream *f)
{
	int ch;

	for (;;)
	{
		ch = fz_read_byte(f);
		/* Range test first: iswhite() is only consulted for values <= 32. */
		if (ch > 32 || !iswhite(ch))
			break;
	}

	if (ch == EOF)
		return;
	fz_unread_byte(f);
}
/*
 * Lex a number whose first character, c, has already been read by the
 * caller. c is expected to be '+', '-', '.' or a decimal digit.
 *
 * An integer is stored in buf->i, a real in buf->f.
 *
 * Returns PDF_TOK_INT or PDF_TOK_REAL.
 */
static int lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
{
	int neg = 0;
	/* The integer part is accumulated in unsigned arithmetic so that an
	 * over-long digit run wraps around with defined behaviour rather than
	 * overflowing a signed int. */
	unsigned int u = 0;
	int i;
	int n;
	int d;
	float v;

	/* Initially we might have +, -, . or a digit */
	switch (c)
	{
	case '.':
		goto loop_after_dot;
	case '-':
		neg = 1;
		break;
	case '+':
		break;
	default: /* Must be a digit */
		u = (unsigned int)(c - '0');
		break;
	}

	while (1)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case '.':
			goto loop_after_dot;
		case RANGE_0_9:
			u = 10 * u + (unsigned int)(c - '0');
			break;
		default:
			fz_unread_byte(f);
			/* Fallthrough */
		case EOF:
			/* Negate while still unsigned: no overflow even for the
			 * most negative value. */
			if (neg)
				u = 0u - u;
			buf->i = (int)u;
			return PDF_TOK_INT;
		}
	}

	/* In here, we've seen a dot, so can accept just digits */
loop_after_dot:
	i = (int)u;
	n = 0;
	d = 1;
	while (1)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			/* Stop accumulating before d *= 10 could overflow. */
			if (d >= INT_MAX/10)
				goto underflow;
			n = n*10 + (c - '0');
			d *= 10;
			break;
		default:
			fz_unread_byte(f);
			/* Fallthrough */
		case EOF:
			v = (float)i + ((float)n / (float)d);
			if (neg)
				v = -v;
			buf->f = v;
			return PDF_TOK_REAL;
		}
	}

underflow:
	/* Ignore any digits after here, because they are too small */
	while (1)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			break;
		default:
			fz_unread_byte(f);
			/* Fallthrough */
		case EOF:
			v = (float)i + ((float)n / (float)d);
			if (neg)
				v = -v;
			buf->f = v;
			return PDF_TOK_REAL;
		}
	}
}
/*
 * Read the next token from stream f.
 *
 * White space and comments are skipped. buf is handed to the lex_* helper
 * that handles names, strings, numbers and keywords. A stray ')' or '>' is
 * reported through fz_warn and scanning carries on. Number-like characters
 * glued directly onto the end of a number are warned about and discarded.
 *
 * Returns a PDF_TOK_* value; PDF_TOK_EOF once the stream is exhausted.
 */
pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(f);
			break;
		case '%':
			lex_comment(f);
			break;
		case '/':
			lex_name(f, buf);
			return PDF_TOK_NAME;
		case '(':
			return lex_string(f, buf);
		case ')':
			fz_warn(f->ctx, "lexical error (unexpected ')')");
			continue;
		case '<':
			c = fz_read_byte(f);
			if (c == '<')
				return PDF_TOK_OPEN_DICT;
			/* Only push back a byte that was really read; there is
			 * nothing to unread after EOF. */
			if (c != EOF)
				fz_unread_byte(f);
			return lex_hex_string(f, buf);
		case '>':
			c = fz_read_byte(f);
			if (c == '>')
				return PDF_TOK_CLOSE_DICT;
			fz_warn(f->ctx, "lexical error (unexpected '>')");
			if (c == EOF)
				return PDF_TOK_EOF;
			/* The byte after the stray '>' starts the next token. */
			fz_unread_byte(f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			/* cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2231 */
			{
				int tok = lex_number(f, buf, c);
				/* Swallow any further number characters that follow
				 * immediately, warning about each one. */
				while (1)
				{
					c = fz_peek_byte(f);
					switch (c)
					{
					case IS_NUMBER:
						fz_warn(f->ctx, "ignoring invalid character after number: '%c'", c);
						fz_read_byte(f);
						continue;
					default:
						return tok;
					}
				}
			}
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* Re-read the first byte as part of the keyword text. */
			fz_unread_byte(f);
			lex_name(f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Lex a parenthesised literal string into lb->scratch.
 *
 * The nesting depth starts at 1, i.e. the opening '(' is taken to have been
 * consumed already. Balanced inner parentheses are copied through, and
 * backslash escapes are decoded: the single-letter escapes, up to three
 * octal digits, and a backslash followed by a line ending (LF, CR or CR LF),
 * which contributes nothing. Lexing stops at the matching ')' or at EOF.
 *
 * The decoded length is stored in lb->len; no terminator is written.
 * Always returns PDF_TOK_STRING.
 */
static int lex_string(fz_stream *f, pdf_lexbuf *lb)
{
	char *out = lb->scratch;
	char *limit = out + lb->size;
	int depth = 1;
	int code;
	int extra;
	int ch;

	for (;;)
	{
		/* Buffer full: grow it, then re-derive both pointers.
		 * NOTE(review): pdf_lexbuf_grow presumably returns the offset to
		 * apply to pointers into the old scratch buffer - confirm. */
		if (out == limit)
		{
			out += pdf_lexbuf_grow(lb);
			limit = lb->scratch + lb->size;
		}

		ch = fz_read_byte(f);
		if (ch == EOF)
			goto done;

		if (ch == '(')
		{
			depth++;
			*out++ = ch;
			continue;
		}

		if (ch == ')')
		{
			depth--;
			if (depth == 0)
				goto done;
			*out++ = ch;
			continue;
		}

		if (ch != '\\')
		{
			*out++ = ch;
			continue;
		}

		/* Backslash: decode the escape that follows. */
		ch = fz_read_byte(f);
		switch (ch)
		{
		case EOF:
			goto done;
		case 'n':
			*out++ = '\n';
			break;
		case 'r':
			*out++ = '\r';
			break;
		case 't':
			*out++ = '\t';
			break;
		case 'b':
			*out++ = '\b';
			break;
		case 'f':
			*out++ = '\f';
			break;
		case '(':
			*out++ = '(';
			break;
		case ')':
			*out++ = ')';
			break;
		case '\\':
			*out++ = '\\';
			break;
		case RANGE_0_7:
			/* One octal digit is in hand; accept up to two more. */
			code = ch - '0';
			for (extra = 0; extra < 2; extra++)
			{
				ch = fz_read_byte(f);
				if (ch < '0' || ch > '7')
				{
					if (ch != EOF)
						fz_unread_byte(f);
					break;
				}
				code = code * 8 + (ch - '0');
			}
			*out++ = code;
			break;
		case '\n':
			/* Line continuation: emit nothing. */
			break;
		case '\r':
			/* Line continuation: also swallow the LF of a CR LF pair. */
			ch = fz_read_byte(f);
			if (ch != '\n' && ch != EOF)
				fz_unread_byte(f);
			break;
		default:
			/* Unknown escape: keep the character, drop the backslash. */
			*out++ = ch;
			break;
		}
	}

done:
	lb->len = out - lb->scratch;
	return PDF_TOK_STRING;
}
/*
 * Lex a name (or keyword) into buf->scratch, NUL-terminate it and store its
 * length in buf->len.
 *
 * Reading stops at white space or a delimiter (which is pushed back), at
 * EOF, or when only the terminator slot is left in the buffer. "#xx" with
 * two hex digits decodes to one byte. '#' followed by a single hex digit
 * stores that digit as the high nibble and ends the name. "##" stores a
 * literal '#' and rescans the second '#'. Any other byte after '#' is
 * pushed back and ends the name.
 */
static void lex_name(fz_stream *f, pdf_lexbuf *buf)
{
	char *out = buf->scratch;
	int room = buf->size;
	int ch;
	int hi;
	int lo;

	/* Each trip round the loop stores exactly one byte, hence room--. */
	for (; room > 1; room--)
	{
		ch = fz_read_byte(f);
		switch (ch)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(f);
			goto finish;
		case EOF:
			goto finish;
		case '#':
			/* High nibble */
			ch = fz_read_byte(f);
			switch (ch)
			{
			case RANGE_0_9:
				hi = (ch - '0') << 4;
				break;
			case RANGE_a_f:
				hi = (ch - 'a' + 10) << 4;
				break;
			case RANGE_A_F:
				hi = (ch - 'A' + 10) << 4;
				break;
			/* cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2300 */
			case '#':
				fz_unread_byte(f);
				*out++ = '#';
				continue;
			default:
				fz_unread_byte(f);
				/* fallthrough */
			case EOF:
				goto finish;
			}

			/* Low nibble */
			ch = fz_read_byte(f);
			switch (ch)
			{
			case RANGE_0_9:
				lo = ch - '0';
				break;
			case RANGE_a_f:
				lo = ch - 'a' + 10;
				break;
			case RANGE_A_F:
				lo = ch - 'A' + 10;
				break;
			default:
				fz_unread_byte(f);
				/* fallthrough */
			case EOF:
				*out++ = hi;
				goto finish;
			}
			*out++ = hi + lo;
			break;
		default:
			*out++ = ch;
			break;
		}
	}

finish:
	*out = '\0';
	buf->len = out - buf->scratch;
}
/*
 * Lex a number whose first character, c, has already been read by the
 * caller. c is expected to be '+', '-', '.' or a decimal digit.
 *
 * An integer is stored in buf->i, a real in buf->f.
 *
 * Returns PDF_TOK_INT or PDF_TOK_REAL.
 */
static int lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
	int negative = 0;
	fz_off_t whole = 0;
	int frac;
	int scale;
	float value;
	float step;

	/* The leading character is a sign, a dot or the first digit. */
	switch (c)
	{
	case '.':
		goto fraction;
	case '-':
		negative = 1;
		break;
	case '+':
		break;
	default:
		/* Must be a digit */
		whole = c - '0';
		break;
	}

	/* Integer part */
	for (;;)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case '.':
			goto fraction;
		case RANGE_0_9:
			/* Overflow is deliberately not checked. Clamping to
			 * INT_MIN/MAX was tried and lost data (Bug695950.pdf is
			 * an example), and tests show Acrobat treats overflow
			 * the same way as this code does (i.e.
			 * 123450000000000000000678 is read as 678). */
			whole = 10*whole + c - '0';
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			buf->i = negative ? -whole : whole;
			return PDF_TOK_INT;
		}
	}

	/* A dot has been seen, so only digits are acceptable from here. */
fraction:
	frac = 0;
	scale = 1;
	for (;;)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			/* Leave before scale *= 10 could overflow; the digit
			 * just read is not accumulated. */
			if (scale >= INT_MAX/10)
				goto tiny_digits;
			frac = frac*10 + (c - '0');
			scale *= 10;
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			value = (float)whole + ((float)frac / (float)scale);
			buf->f = negative ? -value : value;
			return PDF_TOK_REAL;
		}
	}

	/* The integer scale is exhausted: add the remaining digits in floating
	 * point, each worth a tenth of the previous step. */
tiny_digits:
	step = 1 / (float)scale;
	value = (float)whole + ((float)frac * step);
	for (;;)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			step /= 10;
			value += (c - '0') * step;
			break;
		default:
			fz_unread_byte(ctx, f);
			/* Fallthrough */
		case EOF:
			buf->f = negative ? -value : value;
			return PDF_TOK_REAL;
		}
	}
}
/*
 * Lex a name (or keyword) into buf->scratch, NUL-terminate it and store its
 * length in buf->len.
 *
 * Reading stops at white space or a delimiter (which is pushed back), at
 * EOF, or when only the terminator slot is left in the buffer. "#xx" with
 * two hex digits decodes to one byte. '#' followed by a single hex digit
 * stores that digit as the high nibble and ends the name. '#' followed by
 * anything else ends the name with that byte pushed back.
 */
static void lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	char *out = buf->scratch;
	int room = buf->size;
	int ch;
	int hi;
	int lo;

	/* Each trip round the loop stores exactly one byte, hence room--. */
	for (; room > 1; room--)
	{
		ch = fz_read_byte(ctx, f);
		switch (ch)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto finish;
		case EOF:
			goto finish;
		case '#':
			/* High nibble */
			ch = fz_read_byte(ctx, f);
			switch (ch)
			{
			case RANGE_0_9:
				hi = (ch - '0') << 4;
				break;
			case RANGE_a_f:
				hi = (ch - 'a' + 10) << 4;
				break;
			case RANGE_A_F:
				hi = (ch - 'A' + 10) << 4;
				break;
			default:
				fz_unread_byte(ctx, f);
				/* fallthrough */
			case EOF:
				goto finish;
			}

			/* Low nibble */
			ch = fz_read_byte(ctx, f);
			switch (ch)
			{
			case RANGE_0_9:
				lo = ch - '0';
				break;
			case RANGE_a_f:
				lo = ch - 'a' + 10;
				break;
			case RANGE_A_F:
				lo = ch - 'A' + 10;
				break;
			default:
				fz_unread_byte(ctx, f);
				/* fallthrough */
			case EOF:
				*out++ = hi;
				goto finish;
			}
			*out++ = hi + lo;
			break;
		default:
			*out++ = ch;
			break;
		}
	}

finish:
	*out = '\0';
	buf->len = out - buf->scratch;
}
/*
 * Copy the text of a number from the stream into s, NUL-terminated.
 *
 * At most n - 1 characters are stored. An optional sign or a dot may come
 * first, then digits with at most one dot. *tok is set to PDF_TOK_INT, or
 * to PDF_TOK_REAL once a dot has been seen. The first byte that does not
 * fit the pattern is pushed back.
 *
 * Returns the number of characters stored, not counting the terminator.
 */
static int lex_number(fz_stream *f, char *s, int n, int *tok)
{
	char *start = s;
	int c;

	*tok = PDF_TOK_INT;

	/* No room for anything but the terminator. */
	if (n <= 1)
		goto finish;

	/* First character: +, -, . or a digit */
	c = fz_read_byte(f);
	switch (c)
	{
	case '.':
		*tok = PDF_TOK_REAL;
		*s++ = c;
		n--;
		goto digits_after_dot;
	case '+':
	case '-':
	case RANGE_0_9:
		*s++ = c;
		n--;
		break;
	default:
		fz_unread_byte(f);
		goto finish;
	case EOF:
		goto finish;
	}

	/* A sign is no longer acceptable: only . or a digit */
	for (; n > 1; n--)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case '.':
			*tok = PDF_TOK_REAL;
			*s++ = c;
			n--;
			goto digits_after_dot;
		case RANGE_0_9:
			*s++ = c;
			break;
		default:
			fz_unread_byte(f);
			goto finish;
		case EOF:
			goto finish;
		}
	}

	/* After the dot only digits are acceptable */
digits_after_dot:
	for (; n > 1; n--)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case RANGE_0_9:
			*s++ = c;
			break;
		default:
			fz_unread_byte(f);
			goto finish;
		case EOF:
			goto finish;
		}
	}

finish:
	*s = '\0';
	return s - start;
}
/*
 * Read the next token from stream f.
 *
 * tok receives the PDF_TOK_* value. buf (capacity n) receives the text of
 * names, strings, numbers and keywords, and sl receives its length. White
 * space and comments are skipped.
 *
 * Returns fz_okay on success. A stray ')' or '>' sets *tok to PDF_TOK_ERROR
 * and returns the result of fz_throw("lexical error").
 */
fz_error pdf_lex(int *tok, fz_stream *f, char *buf, int n, int *sl)
{
	while (1)
	{
		int c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			*tok = PDF_TOK_EOF;
			return fz_okay;
		case IS_WHITE:
			lex_white(f);
			break;
		case '%':
			lex_comment(f);
			break;
		case '/':
			lex_name(f, buf, n);
			*sl = strlen(buf);
			*tok = PDF_TOK_NAME;
			return fz_okay;
		case '(':
			*sl = lex_string(f, buf, n);
			*tok = PDF_TOK_STRING;
			return fz_okay;
		case ')':
			goto cleanuperror;
		case '<':
			c = fz_read_byte(f);
			if (c == '<')
			{
				*tok = PDF_TOK_OPEN_DICT;
			}
			else
			{
				/* Only push back a byte that was really read; there
				 * is nothing to unread after EOF. */
				if (c != EOF)
					fz_unread_byte(f);
				*sl = lex_hex_string(f, buf, n);
				*tok = PDF_TOK_STRING;
			}
			return fz_okay;
		case '>':
			c = fz_read_byte(f);
			if (c == '>')
			{
				*tok = PDF_TOK_CLOSE_DICT;
				return fz_okay;
			}
			goto cleanuperror;
		case '[':
			*tok = PDF_TOK_OPEN_ARRAY;
			return fz_okay;
		case ']':
			*tok = PDF_TOK_CLOSE_ARRAY;
			return fz_okay;
		case '{':
			*tok = PDF_TOK_OPEN_BRACE;
			return fz_okay;
		case '}':
			*tok = PDF_TOK_CLOSE_BRACE;
			return fz_okay;
		case IS_NUMBER:
			/* lex_number reads the whole number itself, first byte included. */
			fz_unread_byte(f);
			*sl = lex_number(f, buf, n, tok);
			return fz_okay;
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* Re-read the first byte as part of the keyword text. */
			fz_unread_byte(f);
			lex_name(f, buf, n);
			*sl = strlen(buf);
			*tok = pdf_token_from_keyword(buf);
			return fz_okay;
		}
	}

cleanuperror:
	*tok = PDF_TOK_ERROR;
	return fz_throw("lexical error");
}
/*
 * Lex a parenthesised literal string into buf, storing at most n bytes.
 *
 * The nesting depth starts at 1, i.e. the opening '(' is taken to have been
 * consumed already. Balanced inner parentheses are copied through, and
 * backslash escapes are decoded: the single-letter escapes, up to three
 * octal digits, and a backslash followed by a line ending (LF, CR or CR LF),
 * which contributes nothing. Lexing stops at the matching ')', at EOF, or
 * when buf is full.
 *
 * Returns the number of bytes stored; no terminator is written.
 */
static int lex_string(fz_stream *f, char *buf, int n)
{
	char *s = buf;
	char *e = buf + n;
	int bal = 1;
	int oct;
	int c;

	while (s < e)
	{
		c = fz_read_byte(f);
		switch (c)
		{
		case EOF:
			goto end;
		case '(':
			bal++;
			*s++ = c;
			break;
		case ')':
			bal--;
			if (bal == 0)
				goto end;
			*s++ = c;
			break;
		case '\\':
			c = fz_read_byte(f);
			switch (c)
			{
			case EOF:
				goto end;
			case 'n':
				*s++ = '\n';
				break;
			case 'r':
				*s++ = '\r';
				break;
			case 't':
				*s++ = '\t';
				break;
			case 'b':
				*s++ = '\b';
				break;
			case 'f':
				*s++ = '\f';
				break;
			case '(':
				*s++ = '(';
				break;
			case ')':
				*s++ = ')';
				break;
			case '\\':
				*s++ = '\\';
				break;
			/* Octal escape: only 0-7 are octal digits. '8' and '9'
			 * take the default branch below and are kept as ordinary
			 * characters. */
			case '0': case '1': case '2': case '3':
			case '4': case '5': case '6': case '7':
				oct = c - '0';
				c = fz_read_byte(f);
				if (c >= '0' && c <= '7')
				{
					oct = oct * 8 + (c - '0');
					c = fz_read_byte(f);
					if (c >= '0' && c <= '7')
						oct = oct * 8 + (c - '0');
					else if (c != EOF)
						fz_unread_byte(f);
				}
				else if (c != EOF)
					fz_unread_byte(f);
				*s++ = oct;
				break;
			case '\n':
				/* Line continuation: emit nothing. */
				break;
			case '\r':
				/* Line continuation: also swallow the LF of a CR LF pair. */
				c = fz_read_byte(f);
				if ((c != '\n') && (c != EOF))
					fz_unread_byte(f);
				break;
			default:
				/* Unknown escape: keep the character, drop the backslash. */
				*s++ = c;
				break;
			}
			break;
		default:
			*s++ = c;
			break;
		}
	}

end:
	return s - buf;
}
/*
 * Lex a name (or keyword) into s and NUL-terminate it; at most n - 1 bytes
 * are stored ahead of the terminator.
 *
 * Reading stops at white space or a delimiter (which is pushed back), at
 * EOF, or when only the terminator slot is left. "#xx" with two hex digits
 * decodes to one byte. '#' followed by a single hex digit stores that digit
 * as the high nibble and ends the name. '#' followed by anything else ends
 * the name with that byte pushed back.
 */
static void lex_name(fz_stream *f, char *s, int n)
{
	int ch;
	int hi;
	int lo;

	/* Each trip round the loop stores exactly one byte, hence n--. */
	for (; n > 1; n--)
	{
		ch = fz_read_byte(f);
		switch (ch)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(f);
			goto finish;
		case EOF:
			goto finish;
		case '#':
			/* High nibble */
			ch = fz_read_byte(f);
			switch (ch)
			{
			case RANGE_0_9:
				hi = (ch - '0') << 4;
				break;
			case RANGE_a_f:
				hi = (ch - 'a' + 10) << 4;
				break;
			case RANGE_A_F:
				hi = (ch - 'A' + 10) << 4;
				break;
			default:
				fz_unread_byte(f);
				/* fallthrough */
			case EOF:
				goto finish;
			}

			/* Low nibble */
			ch = fz_read_byte(f);
			switch (ch)
			{
			case RANGE_0_9:
				lo = ch - '0';
				break;
			case RANGE_a_f:
				lo = ch - 'a' + 10;
				break;
			case RANGE_A_F:
				lo = ch - 'A' + 10;
				break;
			default:
				fz_unread_byte(f);
				/* fallthrough */
			case EOF:
				*s++ = hi;
				goto finish;
			}
			*s++ = hi + lo;
			break;
		default:
			*s++ = ch;
			break;
		}
	}

finish:
	*s = '\0';
}