static fz_error pdf_readoldtrailer(pdf_xref *xref, char *buf, int cap) { fz_error error; int len; char *s; int n; int t; pdf_token_e tok; int c; pdf_logxref("load old xref format trailer\n"); fz_readline(xref->file, buf, cap); if (strncmp(buf, "xref", 4) != 0) return fz_throw("cannot find xref marker"); while (1) { c = fz_peekbyte(xref->file); if (!(c >= '0' && c <= '9')) break; fz_readline(xref->file, buf, cap); s = buf; fz_strsep(&s, " "); /* ignore ofs */ if (!s) return fz_throw("invalid range marker in xref"); len = atoi(fz_strsep(&s, " ")); /* broken pdfs where the section is not on a separate line */ if (s && *s != '\0') fz_seek(xref->file, -(2 + (int)strlen(s)), 1); t = fz_tell(xref->file); if (t < 0) return fz_throw("cannot tell in file"); fz_seek(xref->file, t + 20 * len, 0); } error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TTRAILER) return fz_throw("expected trailer marker"); error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TODICT) return fz_throw("expected trailer dictionary"); error = pdf_parsedict(&xref->trailer, xref, xref->file, buf, cap); if (error) return fz_rethrow(error, "cannot parse trailer"); return fz_okay; }
static fz_error pdf_repairobjstm(pdf_xref *xref, int num, int gen) { fz_error error; fz_obj *obj; fz_stream *stm; int tok; int i, n, count; char buf[256]; error = pdf_loadobject(&obj, xref, num, gen); if (error) return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen); count = fz_toint(fz_dictgets(obj, "N")); fz_dropobj(obj); error = pdf_openstream(&stm, xref, num, gen); if (error) return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen); for (i = 0; i < count; i++) { error = pdf_lex(&tok, stm, buf, sizeof buf, &n); if (error || tok != PDF_TINT) { fz_close(stm); return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); } n = atoi(buf); if (n >= xref->len) pdf_resizexref(xref, n + 1); xref->table[n].ofs = num; xref->table[n].gen = i; xref->table[n].stmofs = 0; xref->table[n].obj = nil; xref->table[n].type = 'o'; error = pdf_lex(&tok, stm, buf, sizeof buf, &n); if (error || tok != PDF_TINT) { fz_close(stm); return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); } } fz_close(stm); return fz_okay; }
static float pdf_extract_font_size(pdf_xref *xref, char *appearance, char **font_name) { fz_context *ctx = xref->ctx; fz_stream *stream = fz_open_memory(ctx, appearance, strlen(appearance)); float font_size = 0; int tok, len; *font_name = NULL; do { fz_error error = pdf_lex(&tok, stream, xref->scratch, sizeof(xref->scratch), &len); if (error || tok == PDF_TOK_EOF) { fz_free(ctx, *font_name); *font_name = NULL; break; } if (tok == PDF_TOK_NAME) { fz_free(ctx, *font_name); *font_name = fz_strdup(ctx, xref->scratch); } else if (tok == PDF_TOK_REAL || tok == PDF_TOK_INT) { font_size = fz_atof(xref->scratch); } } while (tok != PDF_TOK_KEYWORD || strcmp(xref->scratch, "Tf") != 0); fz_close(stream); return font_size; }
fz_error pdf_parse_stm_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error; int tok; int len; error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse token in object stream"); switch (tok) { case PDF_TOK_OPEN_ARRAY: error = pdf_parse_array(op, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse object stream"); break; case PDF_TOK_OPEN_DICT: error = pdf_parse_dict(op, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse object stream"); break; case PDF_TOK_NAME: *op = fz_new_name(buf); break; case PDF_TOK_REAL: *op = fz_new_real(fz_atof(buf)); break; case PDF_TOK_STRING: *op = fz_new_string(buf, len); break; case PDF_TOK_TRUE: *op = fz_new_bool(1); break; case PDF_TOK_FALSE: *op = fz_new_bool(0); break; case PDF_TOK_NULL: *op = fz_new_null(); break; case PDF_TOK_INT: *op = fz_new_int(atoi(buf)); break; default: return fz_throw("unknown token in object stream"); } return fz_okay; }
pdf_obj * pdf_parse_stm_obj(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf) { pdf_token tok; fz_context *ctx = file->ctx; tok = pdf_lex(file, buf); switch (tok) { case PDF_TOK_OPEN_ARRAY: return pdf_parse_array(doc, file, buf); case PDF_TOK_OPEN_DICT: return pdf_parse_dict(doc, file, buf); case PDF_TOK_NAME: return pdf_new_name(doc, buf->scratch); break; case PDF_TOK_REAL: return pdf_new_real(doc, buf->f); break; case PDF_TOK_STRING: return pdf_new_string(doc, buf->scratch, buf->len); break; case PDF_TOK_TRUE: return pdf_new_bool(doc, 1); break; case PDF_TOK_FALSE: return pdf_new_bool(doc, 0); break; case PDF_TOK_NULL: return pdf_new_null(doc); break; case PDF_TOK_INT: return pdf_new_int(doc, buf->i); break; default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream"); } return NULL; /* Stupid MSVC */ }
fz_error pdf_parsestmobj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error; pdf_token_e tok; int len; error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse token in object stream"); switch (tok) { case PDF_TOARRAY: error = pdf_parsearray(op, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse object stream"); break; case PDF_TODICT: error = pdf_parsedict(op, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse object stream"); break; case PDF_TNAME: *op = fz_newname(buf); break; case PDF_TREAL: *op = fz_newreal(atof(buf)); break; case PDF_TSTRING: *op = fz_newstring(buf, len); break; case PDF_TTRUE: *op = fz_newbool(1); break; case PDF_TFALSE: *op = fz_newbool(0); break; case PDF_TNULL: *op = fz_newnull(); break; case PDF_TINT: *op = fz_newint(atoi(buf)); break; default: return fz_throw("unknown token in object stream"); } return fz_okay; }
static int pdf_lex_cmap(fz_stream *file, pdf_lexbuf *buf) { int tok = pdf_lex(file, buf); /* RJW: Lost debugging here: "cannot parse cmap token" */ if (tok == PDF_TOK_KEYWORD) tok = pdf_cmap_token_from_keyword(buf->scratch); return tok; }
static int pdf_lex_cmap(fz_stream *file, char *buf, int n, int *sl) { int tok = pdf_lex(file, buf, n, sl); /* RJW: Lost debugging here: "cannot parse cmap token" */ if (tok == PDF_TOK_KEYWORD) tok = pdf_cmap_token_from_keyword(buf); return tok; }
static fz_error lexcmap(pdf_token_e *tok, fz_stream *file, char *buf, int n, int *sl) { fz_error error; error = pdf_lex(tok, file, buf, n, sl); if (error) return fz_rethrow(error, "cannot parse cmap token"); if (*tok == PDF_TKEYWORD) *tok = pdf_cmaptokenfromkeyword(buf); return fz_okay; }
pdf_obj * pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) { int tok; fz_context *ctx = file->ctx; tok = pdf_lex(file, buf); /* RJW: "cannot parse token in object stream") */ switch (tok) { case PDF_TOK_OPEN_ARRAY: return pdf_parse_array(xref, file, buf); /* RJW: "cannot parse object stream" */ case PDF_TOK_OPEN_DICT: return pdf_parse_dict(xref, file, buf); /* RJW: "cannot parse object stream" */ case PDF_TOK_NAME: return fz_new_name(ctx, buf->scratch); break; case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); break; case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); break; case PDF_TOK_TRUE: return pdf_new_bool(ctx, 1); break; case PDF_TOK_FALSE: return pdf_new_bool(ctx, 0); break; case PDF_TOK_NULL: return pdf_new_null(ctx); break; case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); break; default: fz_throw(ctx, "unknown token in object stream"); } return NULL; /* Stupid MSVC */ }
fz_error * pdf_parsestmobj(fz_obj **op, fz_stream *file, char *buf, int cap) { int tok, len; tok = pdf_lex(file, buf, cap, &len); switch (tok) { case PDF_TOARRAY: return pdf_parsearray(op, file, buf, cap); case PDF_TODICT: return pdf_parsedict(op, file, buf, cap); case PDF_TNAME: return fz_newname(op, buf); case PDF_TREAL: return fz_newreal(op, atof(buf)); case PDF_TSTRING: return fz_newstring(op, buf, len); case PDF_TTRUE: return fz_newbool(op, 1); case PDF_TFALSE: return fz_newbool(op, 0); case PDF_TNULL: return fz_newnull(op); case PDF_TINT: return fz_newint(op, atoi(buf)); } return fz_throw("syntaxerror: corrupt object stream"); }
static fz_error pdf_readoldxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap) { fz_error error; int ofs, len; char *s; int n; pdf_token_e tok; int i; int c; pdf_logxref("load old xref format\n"); fz_readline(xref->file, buf, cap); if (strncmp(buf, "xref", 4) != 0) return fz_throw("cannot find xref marker"); while (1) { c = fz_peekbyte(xref->file); if (!(c >= '0' && c <= '9')) break; fz_readline(xref->file, buf, cap); s = buf; ofs = atoi(fz_strsep(&s, " ")); len = atoi(fz_strsep(&s, " ")); /* broken pdfs where the section is not on a separate line */ if (s && *s != '\0') { fz_warn("broken xref section. proceeding anyway."); fz_seek(xref->file, -(2 + (int)strlen(s)), 1); } /* broken pdfs where size in trailer undershoots entries in xref sections */ if (ofs + len > xref->cap) { fz_warn("broken xref section, proceeding anyway."); xref->cap = ofs + len; xref->table = fz_realloc(xref->table, xref->cap * sizeof(pdf_xrefentry)); } if ((ofs + len) > xref->len) { for (i = xref->len; i < (ofs + len); i++) { xref->table[i].ofs = 0; xref->table[i].gen = 0; xref->table[i].stmofs = 0; xref->table[i].obj = nil; xref->table[i].type = 0; } xref->len = ofs + len; } for (i = ofs; i < ofs + len; i++) { n = fz_read(xref->file, (unsigned char *) buf, 20); if (n < 0) return fz_rethrow(n, "cannot read xref table"); if (!xref->table[i].type) { s = buf; /* broken pdfs where line start with white space */ while (*s != '\0' && iswhite(*s)) s++; xref->table[i].ofs = atoi(s); xref->table[i].gen = atoi(s + 11); xref->table[i].type = s[17]; } } } error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TTRAILER) return fz_throw("expected trailer marker"); error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TODICT) return fz_throw("expected trailer dictionary"); error = pdf_parsedict(trailerp, xref, xref->file, buf, cap); if (error) return fz_rethrow(error, "cannot parse trailer"); return fz_okay; }
fz_error pdf_parseindobj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap, int *onum, int *ogen, int *ostmofs) { fz_error error = fz_okay; fz_obj *obj = nil; int num = 0, gen = 0, stmofs; pdf_token_e tok; int len; int a, b; error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TINT) return fz_throw("cannot parse indirect object (%d %d R)", num, gen); num = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TINT) return fz_throw("cannot parse indirect object (%d %d R)", num, gen); gen = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TOBJ) return fz_throw("cannot parse indirect object (%d %d R)", num, gen); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); switch (tok) { case PDF_TOARRAY: error = pdf_parsearray(&obj, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); break; case PDF_TODICT: error = pdf_parsedict(&obj, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); break; case PDF_TNAME: obj = fz_newname(buf); break; case PDF_TREAL: obj = fz_newreal(atof(buf)); break; case PDF_TSTRING: obj = fz_newstring(buf, len); break; case PDF_TTRUE: obj = fz_newbool(1); break; case PDF_TFALSE: obj = fz_newbool(0); break; case PDF_TNULL: obj = fz_newnull(); break; case PDF_TINT: a = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok == PDF_TSTREAM || tok == PDF_TENDOBJ) { obj = fz_newint(a); goto skip; } if (tok == PDF_TINT) { b = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok == PDF_TR) { obj = fz_newindirect(a, b, xref); break; } } return fz_throw("cannot parse indirect object (%d %d R)", num, gen); case PDF_TENDOBJ: obj = fz_newnull(); goto skip; default: return fz_throw("cannot parse indirect object (%d %d R)", num, gen); } error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(obj); return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); } skip: if (tok == PDF_TSTREAM) { int c = fz_readbyte(file); while (c == ' ') c = fz_readbyte(file); if (c == '\r') { c = fz_peekbyte(file); if (c != '\n') fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen); else fz_readbyte(file); } stmofs = fz_tell(file); } else if (tok == PDF_TENDOBJ) { stmofs = 0; } else { fz_warn("expected endobj or stream keyword (%d %d R)", num, gen); stmofs = 0; } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stmofs; *op = obj; return fz_okay; }
static fz_error fz_repairobj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id) { fz_error error; int tok; int stmlen; int len; int n; *stmofsp = 0; *stmlenp = -1; stmlen = 0; error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse object"); if (tok == PDF_TODICT) { fz_obj *dict, *obj; /* Send nil xref so we don't try to resolve references */ error = pdf_parsedict(&dict, nil, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse object"); obj = fz_dictgets(dict, "Type"); if (fz_isname(obj) && !strcmp(fz_toname(obj), "XRef")) { obj = fz_dictgets(dict, "Encrypt"); if (obj) { if (*encrypt) fz_dropobj(*encrypt); *encrypt = fz_keepobj(obj); } obj = fz_dictgets(dict, "ID"); if (obj) { if (*id) fz_dropobj(*id); *id = fz_keepobj(obj); } } obj = fz_dictgets(dict, "Length"); if (fz_isint(obj)) stmlen = fz_toint(obj); fz_dropobj(dict); } while ( tok != PDF_TSTREAM && tok != PDF_TENDOBJ && tok != PDF_TERROR && tok != PDF_TEOF ) { error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot scan for endobj or stream token"); } if (tok == PDF_TSTREAM) { int c = fz_readbyte(file); if (c == '\r') { c = fz_peekbyte(file); if (c == '\n') fz_readbyte(file); } *stmofsp = fz_tell(file); if (*stmofsp < 0) return fz_throw("cannot seek in file"); if (stmlen > 0) { fz_seek(file, *stmofsp + stmlen, 0); error = pdf_lex(&tok, file, buf, cap, &len); if (error) fz_catch(error, "cannot find endstream token, falling back to scanning"); if (tok == PDF_TENDSTREAM) goto atobjend; fz_seek(file, *stmofsp, 0); } n = fz_read(file, (unsigned char *) buf, 9); if (n < 0) return fz_rethrow(n, "cannot read from file"); while (memcmp(buf, "endstream", 9) != 0) { c = fz_readbyte(file); if (c == EOF) break; memmove(buf, buf + 1, 8); buf[8] = c; } *stmlenp = fz_tell(file) - *stmofsp - 9; atobjend: error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot scan for endobj token"); if (tok != PDF_TENDOBJ) fz_warn("object missing 'endobj' token"); } return fz_okay; }
fz_error pdf_parsedict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error = fz_okay; fz_obj *dict = nil; fz_obj *key = nil; fz_obj *val = nil; pdf_token_e tok; int len; int a, b; dict = fz_newdict(8); while (1) { error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } skip: if (tok == PDF_TCDICT) { *op = dict; return fz_okay; } /* for BI .. ID .. EI in content streams */ if (tok == PDF_TKEYWORD && !strcmp(buf, "ID")) { *op = dict; return fz_okay; } if (tok != PDF_TNAME) { fz_dropobj(dict); return fz_throw("invalid key in dict");; } key = fz_newname(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } switch (tok) { case PDF_TOARRAY: error = pdf_parsearray(&val, xref, file, buf, cap); if (error) { fz_dropobj(key); fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } break; case PDF_TODICT: error = pdf_parsedict(&val, xref, file, buf, cap); if (error) { fz_dropobj(key); fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } break; case PDF_TNAME: val = fz_newname(buf); break; case PDF_TREAL: val = fz_newreal(atof(buf)); break; case PDF_TSTRING: val = fz_newstring(buf, len); break; case PDF_TTRUE: val = fz_newbool(1); break; case PDF_TFALSE: val = fz_newbool(0); break; case PDF_TNULL: val = fz_newnull(); break; case PDF_TINT: /* 64-bit to allow for numbers > INT_MAX and overflow */ a = (int) strtoll(buf, 0, 10); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(key); fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } if (tok == PDF_TCDICT || tok == PDF_TNAME || (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))) { val = fz_newint(a); fz_dictput(dict, key, val); fz_dropobj(val); fz_dropobj(key); goto skip; } if (tok == PDF_TINT) { b = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(key); fz_dropobj(dict); return fz_rethrow(error, "cannot parse dict"); } if (tok == PDF_TR) { val = fz_newindirect(a, b, xref); break; } } fz_dropobj(key); fz_dropobj(dict); return fz_throw("invalid indirect reference in dict"); default: return fz_throw("unknown token in dict"); } fz_dictput(dict, key, val); fz_dropobj(val); fz_dropobj(key); } }
fz_error pdf_parsearray(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error = fz_okay; fz_obj *ary = nil; fz_obj *obj = nil; int a = 0, b = 0, n = 0; pdf_token_e tok; int len; ary = fz_newarray(4); while (1) { error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_dropobj(ary); return fz_rethrow(error, "cannot parse array"); } if (tok != PDF_TINT && tok != PDF_TR) { if (n > 0) { obj = fz_newint(a); fz_arraypush(ary, obj); fz_dropobj(obj); } if (n > 1) { obj = fz_newint(b); fz_arraypush(ary, obj); fz_dropobj(obj); } n = 0; } if (tok == PDF_TINT && n == 2) { obj = fz_newint(a); fz_arraypush(ary, obj); fz_dropobj(obj); a = b; n --; } switch (tok) { case PDF_TCARRAY: *op = ary; return fz_okay; case PDF_TINT: if (n == 0) a = atoi(buf); if (n == 1) b = atoi(buf); n ++; break; case PDF_TR: if (n != 2) { fz_dropobj(ary); return fz_throw("cannot parse indirect reference in array"); } obj = fz_newindirect(a, b, xref); fz_arraypush(ary, obj); fz_dropobj(obj); n = 0; break; case PDF_TOARRAY: error = pdf_parsearray(&obj, xref, file, buf, cap); if (error) { fz_dropobj(ary); return fz_rethrow(error, "cannot parse array"); } fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TODICT: error = pdf_parsedict(&obj, xref, file, buf, cap); if (error) { fz_dropobj(ary); return fz_rethrow(error, "cannot parse array"); } fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TNAME: obj = fz_newname(buf); fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TREAL: obj = fz_newreal(atof(buf)); fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TSTRING: obj = fz_newstring(buf, len); fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TTRUE: obj = fz_newbool(1); fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TFALSE: obj = fz_newbool(0); fz_arraypush(ary, obj); fz_dropobj(obj); break; case PDF_TNULL: obj = fz_newnull(); fz_arraypush(ary, obj); fz_dropobj(obj); break; default: fz_dropobj(ary); return fz_throw("cannot parse token in array"); } } }
fz_error pdf_repairxref(pdf_xref *xref, char *buf, int bufsize) { fz_error error; fz_obj *dict, *obj; fz_obj *length; fz_obj *encrypt = nil; fz_obj *id = nil; fz_obj *root = nil; fz_obj *info = nil; struct entry *list = nil; int listlen; int listcap; int maxnum = 0; int num = 0; int gen = 0; int tmpofs, numofs = 0, genofs = 0; int stmlen, stmofs = 0; int tok; int next; int i, n; pdf_logxref("repairxref %p\n", xref); fz_seek(xref->file, 0, 0); listlen = 0; listcap = 1024; list = fz_calloc(listcap, sizeof(struct entry)); /* look for '%PDF' version marker within first kilobyte of file */ n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024)); if (n < 0) { error = fz_rethrow(n, "cannot read from file"); goto cleanup; } fz_seek(xref->file, 0, 0); for (i = 0; i < n - 4; i++) { if (memcmp(buf + i, "%PDF", 4) == 0) { fz_seek(xref->file, i, 0); break; } } while (1) { tmpofs = fz_tell(xref->file); if (tmpofs < 0) { error = fz_throw("cannot tell in file"); goto cleanup; } error = pdf_lex(&tok, xref->file, buf, bufsize, &n); if (error) { fz_catch(error, "ignoring the rest of the file"); break; } if (tok == PDF_TINT) { numofs = genofs; num = gen; genofs = tmpofs; gen = atoi(buf); } if (tok == PDF_TOBJ) { error = fz_repairobj(xref->file, buf, bufsize, &stmofs, &stmlen, &encrypt, &id); if (error) { error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen); goto cleanup; } pdf_logxref("found object: (%d %d R)\n", num, gen); if (listlen + 1 == listcap) { listcap = (listcap * 3) / 2; list = fz_realloc(list, listcap, sizeof(struct entry)); } list[listlen].num = num; list[listlen].gen = gen; list[listlen].ofs = numofs; list[listlen].stmofs = stmofs; list[listlen].stmlen = stmlen; listlen ++; if (num > maxnum) maxnum = num; } /* trailer dictionary */ if (tok == PDF_TODICT) { error = pdf_parsedict(&dict, xref, xref->file, buf, bufsize); if (error) { error = fz_rethrow(error, "cannot parse object"); goto cleanup; } obj = fz_dictgets(dict, "Encrypt"); if (obj) { if (encrypt) fz_dropobj(encrypt); encrypt = fz_keepobj(obj); } obj = fz_dictgets(dict, "ID"); if (obj) { if (id) fz_dropobj(id); id = fz_keepobj(obj); } obj = fz_dictgets(dict, "Root"); if (obj) { if (root) fz_dropobj(root); root = fz_keepobj(obj); } obj = fz_dictgets(dict, "Info"); if (obj) { if (info) fz_dropobj(info); info = fz_keepobj(obj); } fz_dropobj(dict); } if (tok == PDF_TERROR) fz_readbyte(xref->file); if (tok == PDF_TEOF) break; } /* make xref reasonable */ pdf_resizexref(xref, maxnum + 1); for (i = 0; i < listlen; i++) { xref->table[list[i].num].type = 'n'; xref->table[list[i].num].ofs = list[i].ofs; xref->table[list[i].num].gen = list[i].gen; xref->table[list[i].num].stmofs = list[i].stmofs; /* corrected stream length */ if (list[i].stmlen >= 0) { pdf_logxref("correct stream length %d %d = %d\n", list[i].num, list[i].gen, list[i].stmlen); error = pdf_loadobject(&dict, xref, list[i].num, list[i].gen); if (error) { error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen); goto cleanup; } length = fz_newint(list[i].stmlen); fz_dictputs(dict, "Length", length); fz_dropobj(length); fz_dropobj(dict); } } xref->table[0].type = 'f'; xref->table[0].ofs = 0; xref->table[0].gen = 65535; xref->table[0].stmofs = 0; xref->table[0].obj = nil; next = 0; for (i = xref->len - 1; i >= 0; i--) { if (xref->table[i].type == 'f') { xref->table[i].ofs = next; if (xref->table[i].gen < 65535) xref->table[i].gen ++; next = i; } } /* create a repaired trailer, Root will be added later */ xref->trailer = fz_newdict(5); obj = fz_newint(maxnum + 1); fz_dictputs(xref->trailer, "Size", obj); fz_dropobj(obj); if (root) { fz_dictputs(xref->trailer, "Root", root); fz_dropobj(root); } if (info) { fz_dictputs(xref->trailer, "Info", info); fz_dropobj(info); } if (encrypt) { if (fz_isindirect(encrypt)) { /* create new reference with non-nil xref pointer */ obj = fz_newindirect(fz_tonum(encrypt), fz_togen(encrypt), xref); fz_dropobj(encrypt); encrypt = obj; } fz_dictputs(xref->trailer, "Encrypt", encrypt); fz_dropobj(encrypt); } if (id) { if (fz_isindirect(id)) { /* create new reference with non-nil xref pointer */ obj = fz_newindirect(fz_tonum(id), fz_togen(id), xref); fz_dropobj(id); id = obj; } fz_dictputs(xref->trailer, "ID", id); fz_dropobj(id); } fz_free(list); return fz_okay; cleanup: if (encrypt) fz_dropobj(encrypt); if (id) fz_dropobj(id); if (root) fz_dropobj(root); if (info) fz_dropobj(info); fz_free(list); return error; /* already rethrown */ }
pdf_obj * pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf, int *onum, int *ogen, fz_off_t *ostmofs, int *try_repair) { pdf_obj *obj = NULL; int num = 0, gen = 0; fz_off_t stm_ofs; pdf_token tok; fz_off_t a, b; fz_var(obj); tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_INT) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number"); } num = buf->i; tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_INT) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num); } gen = buf->i; tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_OBJ) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen); } tok = pdf_lex(ctx, file, buf); switch (tok) { case PDF_TOK_OPEN_ARRAY: obj = pdf_parse_array(ctx, doc, file, buf); break; case PDF_TOK_OPEN_DICT: obj = pdf_parse_dict(ctx, doc, file, buf); break; case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break; case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break; case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break; case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break; case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break; case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break; case PDF_TOK_INT: a = buf->i; tok = pdf_lex(ctx, file, buf); if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) { obj = pdf_new_int_offset(ctx, doc, a); goto skip; } if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(ctx, file, buf); if (tok == PDF_TOK_R) { obj = pdf_new_indirect(ctx, doc, a, b); break; } } fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen); case PDF_TOK_ENDOBJ: obj = pdf_new_null(ctx, doc); goto skip; default: fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen); } fz_try(ctx) { tok = pdf_lex(ctx, file, buf); } fz_catch(ctx) { pdf_drop_obj(ctx, obj); fz_rethrow(ctx); } skip: if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(ctx, file); while (c == ' ') c = fz_read_byte(ctx, file); if (c == '\r') { c = fz_peek_byte(ctx, file); if (c != '\n') fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); else fz_read_byte(ctx, file); } stm_ofs = fz_tell(ctx, file); } else if (tok == PDF_TOK_ENDOBJ) { stm_ofs = 0; } else { fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); stm_ofs = 0; } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stm_ofs; return obj; }
fz_error pdf_loadobjstm(pdf_xref *xref, int oid, int gen, char *buf, int cap) { fz_error error; fz_stream *stm; fz_obj *objstm; int *oidbuf; int *ofsbuf; fz_obj *obj; int first; int count; int i, n; pdf_token_e tok; pdf_logxref("loadobjstm (%d %d R)\n", oid, gen); error = pdf_loadobject(&objstm, xref, oid, gen); if (error) return fz_rethrow(error, "cannot load object stream object"); count = fz_toint(fz_dictgets(objstm, "N")); first = fz_toint(fz_dictgets(objstm, "First")); pdf_logxref(" count %d\n", count); oidbuf = fz_malloc(count * sizeof(int)); if (!oidbuf) { error = fz_rethrow(-1, "out of memory: object id buffer"); goto cleanupobj; } ofsbuf = fz_malloc(count * sizeof(int)); if (!ofsbuf) { error = fz_rethrow(-1, "out of memory: offset buffer"); goto cleanupoid; } error = pdf_openstream(&stm, xref, oid, gen); if (error) { error = fz_rethrow(error, "cannot open object stream"); goto cleanupofs; } for (i = 0; i < count; i++) { error = pdf_lex(&tok, stm, buf, cap, &n); if (error || tok != PDF_TINT) { error = fz_rethrow(error, "corrupt object stream"); goto cleanupstm; } oidbuf[i] = atoi(buf); error = pdf_lex(&tok, stm, buf, cap, &n); if (error || tok != PDF_TINT) { error = fz_rethrow(error, "corrupt object stream"); goto cleanupstm; } ofsbuf[i] = atoi(buf); } error = fz_seek(stm, first, 0); if (error) { error = fz_rethrow(error, "cannot seek in object stream"); goto cleanupstm; } for (i = 0; i < count; i++) { /* FIXME: seek to first + ofsbuf[i] */ error = pdf_parsestmobj(&obj, xref, stm, buf, cap); if (error) { error = fz_rethrow(error, "cannot parse object %d in stream", i); goto cleanupstm; } if (oidbuf[i] < 1 || oidbuf[i] >= xref->len) { fz_dropobj(obj); error = fz_throw("object id (%d 0 R) out of range (0..%d)", oidbuf[i], xref->len - 1); goto cleanupstm; } if (xref->table[oidbuf[i]].obj) fz_dropobj(xref->table[oidbuf[i]].obj); xref->table[oidbuf[i]].obj = obj; } fz_dropstream(stm); fz_free(ofsbuf); fz_free(oidbuf); fz_dropobj(objstm); return fz_okay; cleanupstm: fz_dropstream(stm); cleanupofs: fz_free(ofsbuf); cleanupoid: fz_free(oidbuf); cleanupobj: fz_dropobj(objstm); return error; /* already rethrown */ }
pdf_obj * pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) { pdf_obj *ary = NULL; pdf_obj *obj = NULL; int a = 0, b = 0, n = 0; int tok; fz_context *ctx = file->ctx; pdf_obj *op; fz_var(obj); ary = pdf_new_array(ctx, 4); fz_try(ctx) { while (1) { tok = pdf_lex(file, buf); if (tok != PDF_TOK_INT && tok != PDF_TOK_R) { if (n > 0) { obj = pdf_new_int(ctx, a); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; } if (n > 1) { obj = pdf_new_int(ctx, b); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; } n = 0; } if (tok == PDF_TOK_INT && n == 2) { obj = pdf_new_int(ctx, a); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; a = b; n --; } switch (tok) { case PDF_TOK_CLOSE_ARRAY: op = ary; goto end; case PDF_TOK_INT: if (n == 0) a = buf->i; if (n == 1) b = buf->i; n ++; break; case PDF_TOK_R: if (n != 2) fz_throw(ctx, "cannot parse indirect reference in array"); obj = pdf_new_indirect(ctx, a, b, xref); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; n = 0; break; case PDF_TOK_OPEN_ARRAY: obj = pdf_parse_array(xref, file, buf); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_OPEN_DICT: obj = pdf_parse_dict(xref, file, buf); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_NAME: obj = fz_new_name(ctx, buf->scratch); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, 1); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, 0); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; case PDF_TOK_NULL: obj = pdf_new_null(ctx); pdf_array_push(ary, obj); pdf_drop_obj(obj); obj = NULL; break; default: fz_throw(ctx, "cannot parse token in array"); } } end: {} } fz_catch(ctx) { pdf_drop_obj(obj); pdf_drop_obj(ary); fz_throw(ctx, "cannot parse array"); } return op; }
fz_error * pdf_parsearray(fz_obj **op, fz_stream *file, char *buf, int cap) { fz_error *error = nil; fz_obj *ary = nil; fz_obj *obj = nil; int a = 0, b = 0, n = 0; int tok, len; error = fz_newarray(op, 4); if (error) return error; ary = *op; while (1) { tok = pdf_lex(file, buf, cap, &len); if (tok != PDF_TINT && tok != PDF_TR) { if (n > 0) { error = fz_newint(&obj, a); if (error) goto cleanup; error = fz_arraypush(ary, obj); if (error) goto cleanup; fz_dropobj(obj); obj = nil; } if (n > 1) { error = fz_newint(&obj, b); if (error) goto cleanup; error = fz_arraypush(ary, obj); if (error) goto cleanup; fz_dropobj(obj); obj = nil; } n = 0; } if (tok == PDF_TINT && n == 2) { error = fz_newint(&obj, a); if (error) goto cleanup; error = fz_arraypush(ary, obj); if (error) goto cleanup; fz_dropobj(obj); obj = nil; a = b; n --; } switch (tok) { case PDF_TCARRAY: return nil; case PDF_TINT: if (n == 0) a = atoi(buf); if (n == 1) b = atoi(buf); n ++; break; case PDF_TR: if (n != 2) goto cleanup; error = fz_newindirect(&obj, a, b); if (error) goto cleanup; n = 0; break; case PDF_TOARRAY: error = pdf_parsearray(&obj, file, buf, cap); break; case PDF_TODICT: error = pdf_parsedict(&obj, file, buf, cap); break; case PDF_TNAME: error = fz_newname(&obj, buf); break; case PDF_TREAL: error = fz_newreal(&obj, atof(buf)); break; case PDF_TSTRING: error = fz_newstring(&obj, buf, len); break; case PDF_TTRUE: error = fz_newbool(&obj, 1); break; case PDF_TFALSE: error = fz_newbool(&obj, 0); break; case PDF_TNULL: error = fz_newnull(&obj); break; default: goto cleanup; } if (error) goto cleanup; if (obj) { error = fz_arraypush(ary, obj); if (error) goto cleanup; fz_dropobj(obj); } obj = nil; } cleanup: if (obj) fz_dropobj(obj); if (ary) fz_dropobj(ary); if (error) return error; return fz_throw("syntaxerror: corrupt array"); }
fz_error * pdf_parseindobj(fz_obj **op, fz_stream *file, char *buf, int cap, int *ooid, int *ogid, int *ostmofs) { fz_error *error = nil; fz_obj *obj = nil; int oid = 0, gid = 0, stmofs; int tok, len; int a, b; tok = pdf_lex(file, buf, cap, &len); if (tok != PDF_TINT) goto cleanup; oid = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok != PDF_TINT) goto cleanup; gid = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok != PDF_TOBJ) goto cleanup; tok = pdf_lex(file, buf, cap, &len); switch (tok) { case PDF_TOARRAY: error = pdf_parsearray(&obj, file, buf, cap); break; case PDF_TODICT: error = pdf_parsedict(&obj, file, buf, cap); break; case PDF_TNAME: error = fz_newname(&obj, buf); break; case PDF_TREAL: error = fz_newreal(&obj, atof(buf)); break; case PDF_TSTRING: error = fz_newstring(&obj, buf, len); break; case PDF_TTRUE: error = fz_newbool(&obj, 1); break; case PDF_TFALSE: error = fz_newbool(&obj, 0); break; case PDF_TNULL: error = fz_newnull(&obj); break; case PDF_TINT: a = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok == PDF_TSTREAM || tok == PDF_TENDOBJ) { error = fz_newint(&obj, a); if (error) goto cleanup; goto skip; } if (tok == PDF_TINT) { b = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok == PDF_TR) { error = fz_newindirect(&obj, a, b); break; } } goto cleanup; default: goto cleanup; } if (error) goto cleanup; tok = pdf_lex(file, buf, cap, &len); skip: if (tok == PDF_TSTREAM) { int c = fz_readbyte(file); if (c == '\r') { c = fz_peekbyte(file); if (c != '\n') fz_warn("syntaxerror: DOS format line ending after stream keyword (%d %d)\n", oid, gid); else c = fz_readbyte(file); } stmofs = fz_tell(file); } else if (tok == PDF_TENDOBJ) stmofs = 0; else goto cleanup; if (ooid) *ooid = oid; if (ogid) *ogid = gid; if (ostmofs) *ostmofs = stmofs; *op = obj; return nil; cleanup: if (obj) fz_dropobj(obj); if (error) return error; return fz_throw("syntaxerror: corrupt indirect object (%d %d)", oid, gid); }
static fz_error pdf_loadobjstm(pdf_xref *xref, int num, int gen, char *buf, int cap) { fz_error error; fz_stream *stm; fz_obj *objstm; int *numbuf; int *ofsbuf; fz_obj *obj; int first; int count; int i, n; pdf_token_e tok; pdf_logxref("loadobjstm (%d %d R)\n", num, gen); error = pdf_loadobject(&objstm, xref, num, gen); if (error) return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen); count = fz_toint(fz_dictgets(objstm, "N")); first = fz_toint(fz_dictgets(objstm, "First")); pdf_logxref("\tcount %d\n", count); numbuf = fz_malloc(count * sizeof(int)); ofsbuf = fz_malloc(count * sizeof(int)); error = pdf_openstream(&stm, xref, num, gen); if (error) { error = fz_rethrow(error, "cannot open object stream (%d %d R)", num, gen); goto cleanupbuf; } for (i = 0; i < count; i++) { error = pdf_lex(&tok, stm, buf, cap, &n); if (error || tok != PDF_TINT) { error = fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); goto cleanupstm; } numbuf[i] = atoi(buf); error = pdf_lex(&tok, stm, buf, cap, &n); if (error || tok != PDF_TINT) { error = fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); goto cleanupstm; } ofsbuf[i] = atoi(buf); } fz_seek(stm, first, 0); for (i = 0; i < count; i++) { fz_seek(stm, first + ofsbuf[i], 0); error = pdf_parsestmobj(&obj, xref, stm, buf, cap); if (error) { error = fz_rethrow(error, "cannot parse object %d in stream (%d %d R)", i, num, gen); goto cleanupstm; } if (numbuf[i] < 1 || numbuf[i] >= xref->len) { fz_dropobj(obj); error = fz_throw("object id (%d 0 R) out of range (0..%d)", numbuf[i], xref->len - 1); goto cleanupstm; } if (xref->table[numbuf[i]].type == 'o' && xref->table[numbuf[i]].ofs == num) { if (xref->table[numbuf[i]].obj) fz_dropobj(xref->table[numbuf[i]].obj); xref->table[numbuf[i]].obj = obj; } else { fz_dropobj(obj); } } fz_close(stm); fz_free(ofsbuf); fz_free(numbuf); fz_dropobj(objstm); return fz_okay; cleanupstm: fz_close(stm); cleanupbuf: fz_free(ofsbuf); fz_free(numbuf); fz_dropobj(objstm); return error; /* already rethrown */ }
pdf_obj * pdf_parse_dict(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf) { pdf_obj *dict; pdf_obj *key = NULL; pdf_obj *val = NULL; pdf_token tok; int a, b; fz_context *ctx = file->ctx; dict = pdf_new_dict(doc, 8); fz_var(key); fz_var(val); fz_try(ctx) { while (1) { tok = pdf_lex(file, buf); skip: if (tok == PDF_TOK_CLOSE_DICT) break; /* for BI .. ID .. EI in content streams */ if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")) break; if (tok != PDF_TOK_NAME) fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict"); key = pdf_new_name(doc, buf->scratch); tok = pdf_lex(file, buf); switch (tok) { case PDF_TOK_OPEN_ARRAY: val = pdf_parse_array(doc, file, buf); break; case PDF_TOK_OPEN_DICT: val = pdf_parse_dict(doc, file, buf); break; case PDF_TOK_NAME: val = pdf_new_name(doc, buf->scratch); break; case PDF_TOK_REAL: val = pdf_new_real(doc, buf->f); break; case PDF_TOK_STRING: val = pdf_new_string(doc, buf->scratch, buf->len); break; case PDF_TOK_TRUE: val = pdf_new_bool(doc, 1); break; case PDF_TOK_FALSE: val = pdf_new_bool(doc, 0); break; case PDF_TOK_NULL: val = pdf_new_null(doc); break; case PDF_TOK_INT: /* 64-bit to allow for numbers > INT_MAX and overflow */ a = buf->i; tok = pdf_lex(file, buf); if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))) { val = pdf_new_int(doc, a); pdf_dict_put(dict, key, val); pdf_drop_obj(val); val = NULL; pdf_drop_obj(key); key = NULL; goto skip; } if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(file, buf); if (tok == PDF_TOK_R) { val = pdf_new_indirect(doc, a, b); break; } } fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict"); default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict"); } pdf_dict_put(dict, key, val); pdf_drop_obj(val); val = NULL; pdf_drop_obj(key); key = NULL; } } fz_catch(ctx) { pdf_drop_obj(dict); pdf_drop_obj(key); pdf_drop_obj(val); fz_rethrow_message(ctx, "cannot parse dict"); } return dict; }
static fz_error pdf_read_old_xref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap) { fz_error error; int ofs, len; char *s; int n; int tok; int i; int c; fz_read_line(xref->file, buf, cap); if (strncmp(buf, "xref", 4) != 0) return fz_throw("cannot find xref marker"); while (1) { c = fz_peek_byte(xref->file); if (!(c >= '0' && c <= '9')) break; fz_read_line(xref->file, buf, cap); s = buf; ofs = atoi(fz_strsep(&s, " ")); len = atoi(fz_strsep(&s, " ")); /* broken pdfs where the section is not on a separate line */ if (s && *s != '\0') { fz_warn("broken xref section. proceeding anyway."); fz_seek(xref->file, -(2 + (int)strlen(s)), 1); } /* broken pdfs where size in trailer undershoots entries in xref sections */ if (ofs + len > xref->len) { fz_warn("broken xref section, proceeding anyway."); pdf_resize_xref(xref, ofs + len); } for (i = ofs; i < ofs + len; i++) { n = fz_read(xref->file, (unsigned char *) buf, 20); if (n < 0) return fz_rethrow(n, "cannot read xref table"); if (!xref->table[i].type) { s = buf; /* broken pdfs where line start with white space */ while (*s != '\0' && iswhite(*s)) s++; xref->table[i].ofs = atoi(s); xref->table[i].gen = atoi(s + 11); xref->table[i].type = s[17]; if (s[17] != 'f' && s[17] != 'n' && s[17] != 'o') return fz_throw("unexpected xref type: %#x (%d %d R)", s[17], i, xref->table[i].gen); } } } error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TOK_TRAILER) return fz_throw("expected trailer marker"); error = pdf_lex(&tok, xref->file, buf, cap, &n); if (error) return fz_rethrow(error, "cannot parse trailer"); if (tok != PDF_TOK_OPEN_DICT) return fz_throw("expected trailer dictionary"); error = pdf_parse_dict(trailerp, xref, xref->file, buf, cap); if (error) return fz_rethrow(error, "cannot parse trailer"); return fz_okay; }
fz_error * pdf_parsedict(fz_obj **op, fz_stream *file, char *buf, int cap) { fz_error *error = nil; fz_obj *dict = nil; fz_obj *key = nil; fz_obj *val = nil; int tok, len; int a, b; error = fz_newdict(op, 8); if (error) return error; dict = *op; while (1) { tok = pdf_lex(file, buf, cap, &len); skip: if (tok == PDF_TCDICT) return nil; /* for BI .. ID .. EI in content streams */ if (tok == PDF_TKEYWORD && !strcmp(buf, "ID")) return nil; if (tok != PDF_TNAME) goto cleanup; error = fz_newname(&key, buf); if (error) goto cleanup; tok = pdf_lex(file, buf, cap, &len); switch (tok) { case PDF_TOARRAY: error = pdf_parsearray(&val, file, buf, cap); break; case PDF_TODICT: error = pdf_parsedict(&val, file, buf, cap); break; case PDF_TNAME: error = fz_newname(&val, buf); break; case PDF_TREAL: error = fz_newreal(&val, atof(buf)); break; case PDF_TSTRING: error = fz_newstring(&val, buf, len); break; case PDF_TTRUE: error = fz_newbool(&val, 1); break; case PDF_TFALSE: error = fz_newbool(&val, 0); break; case PDF_TNULL: error = fz_newnull(&val); break; case PDF_TINT: a = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok == PDF_TCDICT || tok == PDF_TNAME || (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))) { error = fz_newint(&val, a); if (error) goto cleanup; error = fz_dictput(dict, key, val); if (error) goto cleanup; fz_dropobj(val); fz_dropobj(key); key = val = nil; goto skip; } if (tok == PDF_TINT) { b = atoi(buf); tok = pdf_lex(file, buf, cap, &len); if (tok == PDF_TR) { error = fz_newindirect(&val, a, b); break; } } goto cleanup; default: goto cleanup; } if (error) goto cleanup; error = fz_dictput(dict, key, val); if (error) goto cleanup; fz_dropobj(val); fz_dropobj(key); key = val = nil; } cleanup: if (key) fz_dropobj(key); if (val) fz_dropobj(val); if (dict) fz_dropobj(dict); if (error) return error; return fz_throw("syntaxerror: corrupt dictionary"); }
pdf_obj * pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) { pdf_obj *dict; pdf_obj *key = NULL; pdf_obj *val = NULL; int tok; int a, b; fz_context *ctx = file->ctx; dict = pdf_new_dict(ctx, 8); fz_var(key); fz_var(val); fz_try(ctx) { while (1) { tok = pdf_lex(file, buf); skip: if (tok == PDF_TOK_CLOSE_DICT) break; /* for BI .. ID .. EI in content streams */ if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")) break; if (tok != PDF_TOK_NAME) fz_throw(ctx, "invalid key in dict"); key = fz_new_name(ctx, buf->scratch); tok = pdf_lex(file, buf); switch (tok) { case PDF_TOK_OPEN_ARRAY: /* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1643 */ fz_try(ctx) { val = pdf_parse_array(xref, file, buf); } fz_catch(ctx) { fz_warn(ctx, "ignoring broken array for '%s'", pdf_to_name(key)); pdf_drop_obj(key); val = key = NULL; do tok = pdf_lex(file, buf); while (tok != PDF_TOK_CLOSE_DICT && tok != PDF_TOK_CLOSE_ARRAY && tok != PDF_TOK_EOF && tok != PDF_TOK_OPEN_ARRAY && tok != PDF_TOK_OPEN_DICT); if (tok == PDF_TOK_CLOSE_DICT) goto skip; if (tok == PDF_TOK_CLOSE_ARRAY) continue; fz_throw(ctx, "cannot make sense of broken array after all"); } break; case PDF_TOK_OPEN_DICT: val = pdf_parse_dict(xref, file, buf); break; case PDF_TOK_NAME: val = fz_new_name(ctx, buf->scratch); break; case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break; case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break; case PDF_TOK_TRUE: val = pdf_new_bool(ctx, 1); break; case PDF_TOK_FALSE: val = pdf_new_bool(ctx, 0); break; case PDF_TOK_NULL: val = pdf_new_null(ctx); break; case PDF_TOK_INT: /* 64-bit to allow for numbers > INT_MAX and overflow */ a = buf->i; tok = pdf_lex(file, buf); if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))) { val = pdf_new_int(ctx, a); fz_dict_put(dict, key, val); pdf_drop_obj(val); val = NULL; pdf_drop_obj(key); key = NULL; goto skip; } if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(file, buf); if (tok == PDF_TOK_R) { val = pdf_new_indirect(ctx, a, b, xref); break; } } fz_throw(ctx, "invalid indirect reference in dict"); default: fz_throw(ctx, "unknown token in dict"); } fz_dict_put(dict, key, val); pdf_drop_obj(val); val = NULL; pdf_drop_obj(key); key = NULL; } } fz_catch(ctx) { pdf_drop_obj(dict); pdf_drop_obj(key); pdf_drop_obj(val); fz_throw(ctx, "cannot parse dict"); } return dict; }
static fz_error parsecode(pdf_function *func, fz_stream *stream, int *codeptr) { fz_error error; char buf[64]; int len; pdf_token_e tok; int opptr, elseptr; int a, b, mid, cmp; memset(buf, 0, sizeof(buf)); while (1) { error = pdf_lex(&tok, stream, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "calculator function lexical error"); switch(tok) { case PDF_TEOF: return fz_throw("truncated calculator function"); case PDF_TINT: error = resizecode(func, *codeptr); if (error) return fz_rethrow(error, "resize calculator function code"); func->u.p.code[*codeptr].type = PSINT; func->u.p.code[*codeptr].u.i = atoi(buf); ++*codeptr; break; case PDF_TREAL: error = resizecode(func, *codeptr); if (error) return fz_rethrow(error, "resize calculator function code"); func->u.p.code[*codeptr].type = PSREAL; func->u.p.code[*codeptr].u.f = atof(buf); ++*codeptr; break; case PDF_TOBRACE: opptr = *codeptr; *codeptr += 3; error = resizecode(func, opptr + 2); if (error) return fz_rethrow(error, "resize calculator function code"); error = parsecode(func, stream, codeptr); if (error) return fz_rethrow(error, "error in 'if' branch"); error = pdf_lex(&tok, stream, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "calculator function syntax error"); if (tok == PDF_TOBRACE) { elseptr = *codeptr; error = parsecode(func, stream, codeptr); if (error) return fz_rethrow(error, "error in 'else' branch"); error = pdf_lex(&tok, stream, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "calculator function syntax error"); } else { elseptr = -1; } if (tok == PDF_TKEYWORD) { if (!strcmp(buf, "if")) { if (elseptr >= 0) return fz_throw("too many branches for 'if'"); func->u.p.code[opptr].type = PSOPERATOR; func->u.p.code[opptr].u.op = PSOIF; func->u.p.code[opptr+2].type = PSBLOCK; func->u.p.code[opptr+2].u.block = *codeptr; } else if (!strcmp(buf, "ifelse")) { if (elseptr < 0) return fz_throw("not enough branches for 'ifelse'"); func->u.p.code[opptr].type = PSOPERATOR; func->u.p.code[opptr].u.op = PSOIFELSE; func->u.p.code[opptr+1].type = PSBLOCK; func->u.p.code[opptr+1].u.block = elseptr; func->u.p.code[opptr+2].type = PSBLOCK; func->u.p.code[opptr+2].u.block = *codeptr; } else { return fz_throw("unknown keyword in 'if-else' context: '%s'", buf); } } else { return fz_throw("missing keyword in 'if-else' context"); } break; case PDF_TCBRACE: error = resizecode(func, *codeptr); if (error) return fz_rethrow(error, "resize calculator function code"); func->u.p.code[*codeptr].type = PSOPERATOR; func->u.p.code[*codeptr].u.op = PSORETURN; ++*codeptr; return fz_okay; case PDF_TKEYWORD: cmp = -1; a = -1; b = sizeof(psopnames) / sizeof(psopnames[0]); while (b - a > 1) { mid = (a + b) / 2; cmp = strcmp(buf, psopnames[mid]); if (cmp > 0) a = mid; else if (cmp < 0) b = mid; else a = b = mid; } if (cmp != 0) return fz_throw("unknown operator: '%s'", buf); error = resizecode(func, *codeptr); if (error) return fz_rethrow(error, "resize calculator function code"); func->u.p.code[*codeptr].type = PSOPERATOR; func->u.p.code[*codeptr].u.op = a; ++*codeptr; break; default: return fz_throw("calculator function syntax error"); } } }
pdf_obj * pdf_parse_ind_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf, int *onum, int *ogen, int *ostmofs) { pdf_obj *obj = NULL; int num = 0, gen = 0, stm_ofs; int tok; int a, b; fz_context *ctx = file->ctx; fz_var(obj); tok = pdf_lex(file, buf); /* RJW: cannot parse indirect object (%d %d R)", num, gen */ if (tok != PDF_TOK_INT) fz_throw(ctx, "expected object number (%d %d R)", num, gen); num = buf->i; tok = pdf_lex(file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen */ if (tok != PDF_TOK_INT) fz_throw(ctx, "expected generation number (%d %d R)", num, gen); gen = buf->i; tok = pdf_lex(file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen */ if (tok != PDF_TOK_OBJ) fz_throw(ctx, "expected 'obj' keyword (%d %d R)", num, gen); tok = pdf_lex(file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen */ switch (tok) { case PDF_TOK_OPEN_ARRAY: obj = pdf_parse_array(xref, file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen */ break; case PDF_TOK_OPEN_DICT: obj = pdf_parse_dict(xref, file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen */ break; case PDF_TOK_NAME: obj = fz_new_name(ctx, buf->scratch); break; case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break; case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break; case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, 1); break; case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, 0); break; case PDF_TOK_NULL: obj = pdf_new_null(ctx); break; case PDF_TOK_INT: a = buf->i; tok = pdf_lex(file, buf); /* "cannot parse indirect object (%d %d R)", num, gen */ if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) { obj = pdf_new_int(ctx, a); goto skip; } if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(file, buf); /* RJW: "cannot parse indirect object (%d %d R)", num, gen); */ if (tok == PDF_TOK_R) { obj = pdf_new_indirect(ctx, a, b, xref); break; } } fz_throw(ctx, "expected 'R' keyword (%d %d R)", num, gen); case PDF_TOK_ENDOBJ: obj = pdf_new_null(ctx); goto skip; default: fz_throw(ctx, "syntax error in object (%d %d R)", num, gen); } fz_try(ctx) { tok = pdf_lex(file, buf); } fz_catch(ctx) { pdf_drop_obj(obj); fz_throw(ctx, "cannot parse indirect object (%d %d R)", num, gen); } skip: if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(file); while (c == ' ') c = fz_read_byte(file); if (c == '\r') { c = fz_peek_byte(file); if (c != '\n') fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); else fz_read_byte(file); } stm_ofs = fz_tell(file); } else if (tok == PDF_TOK_ENDOBJ) { stm_ofs = 0; } else { fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); stm_ofs = 0; } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stm_ofs; return obj; }
int pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root) { fz_stream *file = doc->file; pdf_token tok; int stm_len; *stmofsp = 0; if (stmlenp) *stmlenp = -1; stm_len = 0; /* On entry to this function, we know that we've just seen * '<int> <int> obj'. We expect the next thing we see to be a * pdf object. Regardless of the type of thing we meet next * we only need to fully parse it if it is a dictionary. */ tok = pdf_lex(ctx, file, buf); if (tok == PDF_TOK_OPEN_DICT) { pdf_obj *dict, *obj; fz_try(ctx) { dict = pdf_parse_dict(ctx, doc, file, buf); } fz_catch(ctx) { fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); /* Don't let a broken object at EOF overwrite a good one */ if (file->eof) fz_rethrow(ctx); /* Silently swallow the error */ dict = pdf_new_dict(ctx, NULL, 2); } /* We must be careful not to try to resolve any indirections * here. We have just read dict, so we know it to be a non * indirected dictionary. Before we look at any values that * we get back from looking up in it, we need to check they * aren't indirected. */ if (encrypt || id || root) { obj = pdf_dict_get(ctx, dict, PDF_NAME_Type); if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef)) { if (encrypt) { obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt); if (obj) { pdf_drop_obj(ctx, *encrypt); *encrypt = pdf_keep_obj(ctx, obj); } } if (id) { obj = pdf_dict_get(ctx, dict, PDF_NAME_ID); if (obj) { pdf_drop_obj(ctx, *id); *id = pdf_keep_obj(ctx, obj); } } if (root) *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root)); } } obj = pdf_dict_get(ctx, dict, PDF_NAME_Length); if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj)) stm_len = pdf_to_int(ctx, obj); if (doc->file_reading_linearly && page) { obj = pdf_dict_get(ctx, dict, PDF_NAME_Type); if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page)) { pdf_drop_obj(ctx, *page); *page = pdf_keep_obj(ctx, dict); } } pdf_drop_obj(ctx, dict); }