static void writexref(void) { fz_obj *trailer; fz_obj *obj; int startxref; int num; startxref = ftell(out); fprintf(out, "xref\n0 %d\n", xref->len); for (num = 0; num < xref->len; num++) { if (uselist[num]) fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]); else fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]); } fprintf(out, "\n"); trailer = fz_newdict(5); obj = fz_newint(xref->len); fz_dictputs(trailer, "Size", obj); fz_dropobj(obj); obj = fz_dictgets(xref->trailer, "Info"); if (obj) fz_dictputs(trailer, "Info", obj); obj = fz_dictgets(xref->trailer, "Root"); if (obj) fz_dictputs(trailer, "Root", obj); obj = fz_dictgets(xref->trailer, "ID"); if (obj) fz_dictputs(trailer, "ID", obj); fprintf(out, "trailer\n"); fz_fprintobj(out, trailer, !doexpand); fprintf(out, "\n"); fz_dropobj(trailer); fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref); }
/*
 * Load the name tree rooted at 'root' into a newly created dictionary.
 *
 * On success the dictionary is sorted, stored in *dictp and nil is
 * returned.  On failure the error from the failing call is returned
 * unchanged and *dictp is not written.
 */
fz_error *
pdf_loadnametree(fz_obj **dictp, pdf_xref *xref, fz_obj *root)
{
	fz_obj *names = nil;
	fz_error *err;

	err = fz_newdict(&names, 128);
	if (err)
		return err;

	err = loadnametreenode(names, xref, root);
	if (!err)
	{
		fz_sortdict(names);
		*dictp = names;
		return nil;
	}

	/* Loading failed part-way: discard whatever was collected. */
	fz_dropobj(names);
	return err;
}
static void retainpages(int argc, char **argv) { fz_error error; fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent; /* Load the old page tree */ error = pdf_loadpagetree(xref); if (error) die(fz_rethrow(error, "cannot load page tree")); /* Keep only pages/type entry to avoid references to unretained pages */ oldroot = fz_dictgets(xref->trailer, "Root"); pages = fz_dictgets(oldroot, "Pages"); root = fz_newdict(2); fz_dictputs(root, "Type", fz_dictgets(oldroot, "Type")); fz_dictputs(root, "Pages", fz_dictgets(oldroot, "Pages")); pdf_updateobject(xref, fz_tonum(oldroot), fz_togen(oldroot), root); fz_dropobj(root); /* Create a new kids array with only the pages we want to keep */ parent = fz_newindirect(fz_tonum(pages), fz_togen(pages), xref); kids = fz_newarray(1); /* Retain pages specified */ while (argc - fz_optind) { int page, spage, epage; char *spec, *dash; char *pagelist = argv[fz_optind]; spec = fz_strsep(&pagelist, ","); while (spec) { dash = strchr(spec, '-'); if (dash == spec) spage = epage = pdf_getpagecount(xref); else spage = epage = atoi(spec); if (dash) { if (strlen(dash) > 1) epage = atoi(dash + 1); else epage = pdf_getpagecount(xref); } if (spage > epage) page = spage, spage = epage, epage = page; if (spage < 1) spage = 1; if (epage > pdf_getpagecount(xref)) epage = pdf_getpagecount(xref); for (page = spage; page <= epage; page++) { fz_obj *pageobj = pdf_getpageobject(xref, page); fz_obj *pageref = pdf_getpageref(xref, page); fz_dictputs(pageobj, "Parent", parent); /* Store page object in new kids array */ fz_arraypush(kids, pageref); } spec = fz_strsep(&pagelist, ","); } fz_optind++; } fz_dropobj(parent); /* Update page count and kids array */ countobj = fz_newint(fz_arraylen(kids)); fz_dictputs(pages, "Count", countobj); fz_dropobj(countobj); fz_dictputs(pages, "Kids", kids); fz_dropobj(kids); }
/*
 * Parse the entries of a dictionary from 'file' until the closing token.
 *
 * The function never consumes an opening dictionary token itself; the
 * caller is expected to have read it already (as the recursive call in
 * the PDF_TODICT case does).  Parsing ends on PDF_TCDICT, or on the "ID"
 * keyword so that inline image dictionaries (BI .. ID .. EI) can be read.
 *
 * op    receives the new dictionary on success; not written on failure.
 * xref  stored in any indirect references that are created.
 * buf   scratch buffer of 'cap' bytes used by the lexer.
 *
 * Returns fz_okay on success or a (re)thrown error.  On every error path
 * the partially built dictionary and any pending key are released.
 */
fz_error
pdf_parsedict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *dict = nil;
	fz_obj *key = nil;
	fz_obj *val = nil;
	pdf_token_e tok;
	int len;
	int a, b;

	dict = fz_newdict(8);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_dropobj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

skip:
		/* Re-entered from the integer case, which has to read one token
		 * ahead to tell a plain integer from an "a b R" reference. */
		if (tok == PDF_TCDICT)
		{
			*op = dict;
			return fz_okay;
		}

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))
		{
			*op = dict;
			return fz_okay;
		}

		if (tok != PDF_TNAME)
		{
			fz_dropobj(dict);
			return fz_throw("invalid key in dict");
		}

		key = fz_newname(buf);

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			/* The key is already allocated here and must be released too. */
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

		switch (tok)
		{
		case PDF_TOARRAY:
			error = pdf_parsearray(&val, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TODICT:
			error = pdf_parsedict(&val, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TNAME: val = fz_newname(buf); break;
		case PDF_TREAL: val = fz_newreal(atof(buf)); break;
		case PDF_TSTRING: val = fz_newstring(buf, len); break;
		case PDF_TTRUE: val = fz_newbool(1); break;
		case PDF_TFALSE: val = fz_newbool(0); break;
		case PDF_TNULL: val = fz_newnull(); break;

		case PDF_TINT:
			/* 64-bit to allow for numbers > INT_MAX and overflow */
			a = (int) strtoll(buf, 0, 10);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			/* The integer was the whole value: store it and handle the
			 * token just read as the start of the next entry. */
			if (tok == PDF_TCDICT || tok == PDF_TNAME ||
				(tok == PDF_TKEYWORD && !strcmp(buf, "ID")))
			{
				val = fz_newint(a);
				fz_dictput(dict, key, val);
				fz_dropobj(val);
				fz_dropobj(key);
				goto skip;
			}
			if (tok == PDF_TINT)
			{
				b = atoi(buf);
				error = pdf_lex(&tok, file, buf, cap, &len);
				if (error)
				{
					fz_dropobj(key);
					fz_dropobj(dict);
					return fz_rethrow(error, "cannot parse dict");
				}
				if (tok == PDF_TR)
				{
					val = fz_newindirect(a, b, xref);
					break;
				}
			}
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_throw("invalid indirect reference in dict");

		default:
			/* Release the key and the partial dictionary before bailing. */
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_throw("unknown token in dict");
		}

		fz_dictput(dict, key, val);
		fz_dropobj(val);
		fz_dropobj(key);
	}
}
/*
 * Parse a dictionary from the in-memory string at *sp, which must point
 * at the opening "<<".
 *
 * Each entry is a name key (starting with '/') followed by a value read
 * with parseobj().  Parsing stops at ">>" or at the end of the string.
 *
 * On success *obj holds the new dictionary and fz_okay is returned.  On
 * failure *obj is set to nil and the error is returned.  In both cases
 * *sp is advanced to where parsing stopped.
 */
static fz_error parsedict(fz_obj **obj, pdf_xref *xref, char **sp, struct vap *v)
{
	fz_error error = fz_okay;
	fz_obj *result = nil;
	fz_obj *name = nil;
	fz_obj *value = nil;
	char *cur = *sp;

	error = fz_newdict(&result, 8);
	if (error)
		return fz_rethrow(error, "cannot create dict");

	cur += 2; /* skip "<<" */

	for (;;)
	{
		if (!*cur)
			break;

		skipwhite(&cur);

		/* end-of-dict marker >> */
		if (*cur == '>')
		{
			if (cur[1] != '>')
			{
				/* A lone '>' is consumed before the error is reported. */
				cur ++;
				error = fz_throw("malformed >> marker");
				goto fail;
			}
			cur += 2;
			break;
		}

		/* non-name as key, bail */
		if (*cur != '/')
		{
			error = fz_throw("key is not a name");
			goto fail;
		}

		error = parsename(&name, &cur);
		if (error)
		{
			error = fz_rethrow(error, "cannot parse key");
			goto fail;
		}

		skipwhite(&cur);

		error = parseobj(&value, xref, &cur, v);
		if (error)
		{
			error = fz_rethrow(error, "cannot parse value");
			goto fail;
		}

		error = fz_dictput(result, name, value);
		if (error)
		{
			error = fz_rethrow(error, "cannot insert dict entry");
			goto fail;
		}

		/* The dictionary has the entry now; release the local references. */
		fz_dropobj(value);
		value = nil;
		fz_dropobj(name);
		name = nil;
	}

	*obj = result;
	*sp = cur;
	return fz_okay;

fail:
	if (value)
		fz_dropobj(value);
	if (name)
		fz_dropobj(name);
	if (result)
		fz_dropobj(result);
	*obj = nil;
	*sp = cur;
	return error; /* already rethrown */
}
/*
 * Parse the entries of a dictionary from 'file' until the closing token.
 *
 * The function never consumes an opening dictionary token itself; the
 * caller is expected to have read it already (as the recursive call in
 * the PDF_TODICT case does).  Parsing ends on PDF_TCDICT, or on the "ID"
 * keyword so that inline image dictionaries (BI .. ID .. EI) can be read.
 *
 * op   receives the new dictionary on success.  If parsing fails after
 *      the dictionary was created, the dictionary is dropped and *op is
 *      reset to nil so the caller is not left with a dangling pointer.
 * buf  scratch buffer of 'cap' bytes used by the lexer.
 *
 * Returns nil on success, the underlying error if an object could not be
 * created or inserted, or a "corrupt dictionary" syntax error.
 */
fz_error *
pdf_parsedict(fz_obj **op, fz_stream *file, char *buf, int cap)
{
	fz_error *error = nil;
	fz_obj *dict = nil;
	fz_obj *key = nil;
	fz_obj *val = nil;
	int tok, len;
	int a, b;

	error = fz_newdict(op, 8);
	if (error)
		return error;
	dict = *op;

	while (1)
	{
		tok = pdf_lex(file, buf, cap, &len);

skip:
		/* Re-entered from the integer case, which has to read one token
		 * ahead to tell a plain integer from an "a b R" reference. */
		if (tok == PDF_TCDICT)
			return nil;

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))
			return nil;

		if (tok != PDF_TNAME)
			goto cleanup;

		error = fz_newname(&key, buf);
		if (error)
			goto cleanup;

		tok = pdf_lex(file, buf, cap, &len);

		switch (tok)
		{
		case PDF_TOARRAY: error = pdf_parsearray(&val, file, buf, cap); break;
		case PDF_TODICT: error = pdf_parsedict(&val, file, buf, cap); break;
		case PDF_TNAME: error = fz_newname(&val, buf); break;
		case PDF_TREAL: error = fz_newreal(&val, atof(buf)); break;
		case PDF_TSTRING: error = fz_newstring(&val, buf, len); break;
		case PDF_TTRUE: error = fz_newbool(&val, 1); break;
		case PDF_TFALSE: error = fz_newbool(&val, 0); break;
		case PDF_TNULL: error = fz_newnull(&val); break;
		case PDF_TINT:
			a = atoi(buf);
			tok = pdf_lex(file, buf, cap, &len);
			/* The integer was the whole value: store it and handle the
			 * token just read as the start of the next entry. */
			if (tok == PDF_TCDICT || tok == PDF_TNAME ||
				(tok == PDF_TKEYWORD && !strcmp(buf, "ID")))
			{
				error = fz_newint(&val, a);
				if (error)
					goto cleanup;
				error = fz_dictput(dict, key, val);
				if (error)
					goto cleanup;
				fz_dropobj(val);
				fz_dropobj(key);
				key = val = nil;
				goto skip;
			}
			if (tok == PDF_TINT)
			{
				b = atoi(buf);
				tok = pdf_lex(file, buf, cap, &len);
				if (tok == PDF_TR)
				{
					error = fz_newindirect(&val, a, b);
					break;
				}
			}
			goto cleanup;
		default:
			goto cleanup;
		}

		if (error)
			goto cleanup;

		error = fz_dictput(dict, key, val);
		if (error)
			goto cleanup;

		fz_dropobj(val);
		fz_dropobj(key);
		key = val = nil;
	}

cleanup:
	if (key)
		fz_dropobj(key);
	if (val)
		fz_dropobj(val);
	if (dict)
		fz_dropobj(dict);
	/*
	 * *op was pointed at dict by fz_newdict above.  Clear it now that the
	 * dictionary has been dropped; otherwise a caller passing &val (as the
	 * PDF_TODICT case does) would drop the same object again in its own
	 * cleanup.
	 */
	*op = nil;
	if (error)
		return error;
	return fz_throw("syntaxerror: corrupt dictionary");
}
/*
 * Rebuild the cross-reference table of a damaged file by scanning it
 * from start to end.
 *
 * The file is lexed token by token.  Each "num gen obj" found is recorded
 * with its file offset and stream position, and any dictionary seen at
 * the top level is checked for the trailer keys Encrypt, ID, Root and
 * Info (the last occurrence of each wins).  The collected entries are
 * then written into xref->table, stream lengths reported by fz_repairobj
 * are patched into the loaded objects, free entries are chained, and a
 * new trailer dictionary is installed in xref->trailer.
 *
 * xref     document whose file is scanned and whose table is rebuilt.
 * buf      scratch buffer used for reading and lexing.
 * bufsize  size of 'buf' in bytes.
 *
 * Returns fz_okay on success or a rethrown error.  A lexer error during
 * the scan is not fatal: the rest of the file is ignored.
 */
fz_error
pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
{
	fz_error error;
	fz_obj *dict, *obj;
	fz_obj *length;

	fz_obj *encrypt = nil;
	fz_obj *id = nil;
	fz_obj *root = nil;
	fz_obj *info = nil;

	struct entry *list = nil;
	int listlen;
	int listcap;
	int maxnum = 0;

	int num = 0;
	int gen = 0;
	int tmpofs, numofs = 0, genofs = 0;
	int stmlen, stmofs = 0;
	int tok;
	int next;
	int i, n;

	pdf_logxref("repairxref %p\n", xref);

	fz_seek(xref->file, 0, 0);

	listlen = 0;
	listcap = 1024;
	list = fz_calloc(listcap, sizeof(struct entry));

	/*
	 * look for '%PDF' version marker within first kilobyte of file.
	 * Never request more bytes than the buffer can hold.
	 */
	n = fz_read(xref->file, (unsigned char *)buf, bufsize < 1024 ? bufsize : 1024);
	if (n < 0)
	{
		error = fz_rethrow(n, "cannot read from file");
		goto cleanup;
	}

	fz_seek(xref->file, 0, 0);
	for (i = 0; i < n - 4; i++)
	{
		if (memcmp(buf + i, "%PDF", 4) == 0)
		{
			fz_seek(xref->file, i, 0);
			break;
		}
	}

	while (1)
	{
		tmpofs = fz_tell(xref->file);
		if (tmpofs < 0)
		{
			error = fz_throw("cannot tell in file");
			goto cleanup;
		}

		error = pdf_lex(&tok, xref->file, buf, bufsize, &n);
		if (error)
		{
			fz_catch(error, "ignoring the rest of the file");
			break;
		}

		/*
		 * Keep the two most recent integers and their offsets, so that
		 * when "obj" shows up num/gen/numofs describe its header.
		 */
		if (tok == PDF_TINT)
		{
			numofs = genofs;
			num = gen;
			genofs = tmpofs;
			gen = atoi(buf);
		}

		if (tok == PDF_TOBJ)
		{
			error = fz_repairobj(xref->file, buf, bufsize, &stmofs, &stmlen, &encrypt, &id);
			if (error)
			{
				error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen);
				goto cleanup;
			}

			if (num < 0)
			{
				/* The number is used as a table index below; a negative
				 * one from a corrupt file must never get that far. */
				pdf_logxref("ignoring object with invalid number (%d %d R)\n", num, gen);
			}
			else
			{
				pdf_logxref("found object: (%d %d R)\n", num, gen);

				if (listlen + 1 == listcap)
				{
					listcap = (listcap * 3) / 2;
					list = fz_realloc(list, listcap, sizeof(struct entry));
				}

				list[listlen].num = num;
				list[listlen].gen = gen;
				list[listlen].ofs = numofs;
				list[listlen].stmofs = stmofs;
				list[listlen].stmlen = stmlen;
				listlen ++;

				if (num > maxnum)
					maxnum = num;
			}
		}

		/* trailer dictionary */
		if (tok == PDF_TODICT)
		{
			error = pdf_parsedict(&dict, xref, xref->file, buf, bufsize);
			if (error)
			{
				error = fz_rethrow(error, "cannot parse object");
				goto cleanup;
			}

			obj = fz_dictgets(dict, "Encrypt");
			if (obj)
			{
				if (encrypt)
					fz_dropobj(encrypt);
				encrypt = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "ID");
			if (obj)
			{
				if (id)
					fz_dropobj(id);
				id = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "Root");
			if (obj)
			{
				if (root)
					fz_dropobj(root);
				root = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "Info");
			if (obj)
			{
				if (info)
					fz_dropobj(info);
				info = fz_keepobj(obj);
			}

			fz_dropobj(dict);
		}

		/* Step over one byte after a lexer error token before retrying. */
		if (tok == PDF_TERROR)
			fz_readbyte(xref->file);

		if (tok == PDF_TEOF)
			break;
	}

	/* make xref reasonable */

	pdf_resizexref(xref, maxnum + 1);

	for (i = 0; i < listlen; i++)
	{
		xref->table[list[i].num].type = 'n';
		xref->table[list[i].num].ofs = list[i].ofs;
		xref->table[list[i].num].gen = list[i].gen;

		xref->table[list[i].num].stmofs = list[i].stmofs;

		/* corrected stream length */
		if (list[i].stmlen >= 0)
		{
			pdf_logxref("correct stream length %d %d = %d\n",
				list[i].num, list[i].gen, list[i].stmlen);

			error = pdf_loadobject(&dict, xref, list[i].num, list[i].gen);
			if (error)
			{
				error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen);
				goto cleanup;
			}

			length = fz_newint(list[i].stmlen);
			fz_dictputs(dict, "Length", length);
			fz_dropobj(length);

			fz_dropobj(dict);
		}
	}

	/* Entry 0 is always the head of the free list. */
	xref->table[0].type = 'f';
	xref->table[0].ofs = 0;
	xref->table[0].gen = 65535;
	xref->table[0].stmofs = 0;
	xref->table[0].obj = nil;

	/*
	 * Chain the free entries: walking backwards, each free entry's ofs
	 * is set to the index of the free entry visited just before it.
	 */
	next = 0;
	for (i = xref->len - 1; i >= 0; i--)
	{
		if (xref->table[i].type == 'f')
		{
			xref->table[i].ofs = next;
			if (xref->table[i].gen < 65535)
				xref->table[i].gen ++;
			next = i;
		}
	}

	/* create a repaired trailer, Root will be added later */

	xref->trailer = fz_newdict(5);

	obj = fz_newint(maxnum + 1);
	fz_dictputs(xref->trailer, "Size", obj);
	fz_dropobj(obj);

	if (root)
	{
		fz_dictputs(xref->trailer, "Root", root);
		fz_dropobj(root);
	}
	if (info)
	{
		fz_dictputs(xref->trailer, "Info", info);
		fz_dropobj(info);
	}

	if (encrypt)
	{
		if (fz_isindirect(encrypt))
		{
			/* create new reference with non-nil xref pointer */
			obj = fz_newindirect(fz_tonum(encrypt), fz_togen(encrypt), xref);
			fz_dropobj(encrypt);
			encrypt = obj;
		}
		fz_dictputs(xref->trailer, "Encrypt", encrypt);
		fz_dropobj(encrypt);
	}

	if (id)
	{
		if (fz_isindirect(id))
		{
			/* create new reference with non-nil xref pointer */
			obj = fz_newindirect(fz_tonum(id), fz_togen(id), xref);
			fz_dropobj(id);
			id = obj;
		}
		fz_dictputs(xref->trailer, "ID", id);
		fz_dropobj(id);
	}

	fz_free(list);
	return fz_okay;

cleanup:
	if (encrypt) fz_dropobj(encrypt);
	if (id) fz_dropobj(id);
	if (root) fz_dropobj(root);
	if (info) fz_dropobj(info);
	fz_free(list);
	return error; /* already rethrown */
}