static void renumberobj(fz_obj *obj) { int i; fz_context *ctx = xref->ctx; if (fz_is_dict(obj)) { int n = fz_dict_len(obj); for (i = 0; i < n; i++) { fz_obj *key = fz_dict_get_key(obj, i); fz_obj *val = fz_dict_get_val(obj, i); if (fz_is_indirect(val)) { val = fz_new_indirect(ctx, renumbermap[fz_to_num(val)], 0, xref); fz_dict_put(obj, key, val); fz_drop_obj(val); } else { renumberobj(val); } } } else if (fz_is_array(obj)) { int n = fz_array_len(obj); for (i = 0; i < n; i++) { fz_obj *val = fz_array_get(obj, i); if (fz_is_indirect(val)) { val = fz_new_indirect(ctx, renumbermap[fz_to_num(val)], 0, xref); fz_array_put(obj, i, val); fz_drop_obj(val); } else { renumberobj(val); } } } }
static void renumberobjs(void) { pdf_xref_entry *oldxref; int newlen; int num; /* Apply renumber map to indirect references in all objects in xref */ renumberobj(xref->trailer); for (num = 0; num < xref->len; num++) { fz_obj *obj = xref->table[num].obj; if (fz_is_indirect(obj)) { obj = fz_new_indirect(ctx, renumbermap[fz_to_num(obj)], 0, xref); pdf_update_object(xref, num, 0, obj); fz_drop_obj(obj); } else { renumberobj(obj); } } /* Create new table for the reordered, compacted xref */ oldxref = xref->table; xref->table = fz_malloc_array(xref->ctx, xref->len, sizeof(pdf_xref_entry)); xref->table[0] = oldxref[0]; /* Move used objects into the new compacted xref */ newlen = 0; for (num = 1; num < xref->len; num++) { if (uselist[num]) { if (newlen < renumbermap[num]) newlen = renumbermap[num]; xref->table[renumbermap[num]] = oldxref[num]; } else { if (oldxref[num].obj) fz_drop_obj(oldxref[num].obj); } } fz_free(xref->ctx, oldxref); /* Update the used objects count in compacted xref */ xref->len = newlen + 1; /* Update list of used objects to fit with compacted xref */ for (num = 1; num < xref->len; num++) uselist[num] = 1; }
static void saveimage(int num) { fz_error error; fz_pixmap *img; fz_obj *ref; char name[1024]; ref = fz_new_indirect(ctx, num, 0, xref); /* TODO: detect DCTD and save as jpeg */ error = pdf_load_image(&img, xref, ref); if (error) die(error); if (dorgb && img->colorspace && img->colorspace != fz_device_rgb) { fz_pixmap *temp; temp = fz_new_pixmap_with_rect(ctx, fz_device_rgb, fz_bound_pixmap(img)); fz_convert_pixmap(ctx, img, temp); fz_drop_pixmap(ctx, img); img = temp; } if (img->n <= 4) { sprintf(name, "img-%04d.png", num); printf("extracting image %s\n", name); fz_write_png(ctx, img, name, 0); } else { sprintf(name, "img-%04d.pam", num); printf("extracting image %s\n", name); fz_write_pam(ctx, img, name, 0); } fz_drop_pixmap(ctx, img); fz_drop_obj(ctx, ref); }
static void retainpages(int argc, char **argv) { fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; /* Keep only pages/type and (reduced) dest entries to avoid * references to unretained pages */ oldroot = fz_dict_gets(xref->trailer, "Root"); pages = fz_dict_gets(oldroot, "Pages"); olddests = pdf_load_name_tree(xref, "Dests"); root = fz_new_dict(ctx, 2); fz_dict_puts(root, "Type", fz_dict_gets(oldroot, "Type")); fz_dict_puts(root, "Pages", fz_dict_gets(oldroot, "Pages")); pdf_update_object(xref, fz_to_num(oldroot), fz_to_gen(oldroot), root); fz_drop_obj(root); /* Create a new kids array with only the pages we want to keep */ parent = fz_new_indirect(ctx, fz_to_num(pages), fz_to_gen(pages), xref); kids = fz_new_array(ctx, 1); /* Retain pages specified */ while (argc - fz_optind) { int page, spage, epage; char *spec, *dash; char *pagelist = argv[fz_optind]; spec = fz_strsep(&pagelist, ","); while (spec) { dash = strchr(spec, '-'); if (dash == spec) spage = epage = pdf_count_pages(xref); else spage = epage = atoi(spec); if (dash) { if (strlen(dash) > 1) epage = atoi(dash + 1); else epage = pdf_count_pages(xref); } if (spage > epage) page = spage, spage = epage, epage = page; if (spage < 1) spage = 1; if (epage > pdf_count_pages(xref)) epage = pdf_count_pages(xref); for (page = spage; page <= epage; page++) { fz_obj *pageobj = xref->page_objs[page-1]; fz_obj *pageref = xref->page_refs[page-1]; fz_dict_puts(pageobj, "Parent", parent); /* Store page object in new kids array */ fz_array_push(kids, pageref); } spec = fz_strsep(&pagelist, ","); } fz_optind++; } fz_drop_obj(parent); /* Update page count and kids array */ countobj = fz_new_int(ctx, fz_array_len(kids)); fz_dict_puts(pages, "Count", countobj); fz_drop_obj(countobj); fz_dict_puts(pages, "Kids", kids); fz_drop_obj(kids); /* Also preserve the (partial) Dests name tree */ if (olddests) { int i; fz_obj *names = fz_new_dict(ctx, 1); fz_obj *dests = fz_new_dict(ctx, 1); fz_obj *names_list = fz_new_array(ctx, 32); for (i = 0; i < fz_dict_len(olddests); i++) { fz_obj *key = fz_dict_get_key(olddests, i); fz_obj *val = fz_dict_get_val(olddests, i); fz_obj *key_str = fz_new_string(ctx, fz_to_name(key), strlen(fz_to_name(key))); fz_obj *dest = fz_dict_gets(val, "D"); dest = fz_array_get(dest ? dest : val, 0); if (fz_array_contains(fz_dict_gets(pages, "Kids"), dest)) { fz_array_push(names_list, key_str); fz_array_push(names_list, val); } fz_drop_obj(key_str); } root = fz_dict_gets(xref->trailer, "Root"); fz_dict_puts(dests, "Names", names_list); fz_dict_puts(names, "Dests", dests); fz_dict_puts(root, "Names", names); fz_drop_obj(names); fz_drop_obj(dests); fz_drop_obj(names_list); fz_drop_obj(olddests); } }
fz_error pdf_open_xref_with_stream(pdf_xref **xrefp, fz_stream *file, char *password) { pdf_xref *xref; fz_error error; fz_obj *encrypt, *id; fz_obj *dict, *obj; int i, repaired = 0; /* install pdf specific callback */ fz_resolve_indirect = pdf_resolve_indirect; xref = fz_malloc(sizeof(pdf_xref)); memset(xref, 0, sizeof(pdf_xref)); xref->file = fz_keep_stream(file); error = pdf_load_xref(xref, xref->scratch, sizeof xref->scratch); if (error) { fz_catch(error, "trying to repair"); if (xref->table) { fz_free(xref->table); xref->table = NULL; xref->len = 0; } if (xref->trailer) { fz_drop_obj(xref->trailer); xref->trailer = NULL; } error = pdf_repair_xref(xref, xref->scratch, sizeof xref->scratch); if (error) { pdf_free_xref(xref); return fz_rethrow(error, "cannot repair document"); } repaired = 1; } encrypt = fz_dict_gets(xref->trailer, "Encrypt"); id = fz_dict_gets(xref->trailer, "ID"); if (fz_is_dict(encrypt)) { error = pdf_new_crypt(&xref->crypt, encrypt, id); if (error) { pdf_free_xref(xref); return fz_rethrow(error, "cannot decrypt document"); } } if (pdf_needs_password(xref)) { /* Only care if we have a password */ if (password) { int okay = pdf_authenticate_password(xref, password); if (!okay) { pdf_free_xref(xref); return fz_throw("invalid password"); } } } if (repaired) { int hasroot, hasinfo; error = pdf_repair_obj_stms(xref); if (error) { pdf_free_xref(xref); return fz_rethrow(error, "cannot repair document"); } hasroot = fz_dict_gets(xref->trailer, "Root") != NULL; hasinfo = fz_dict_gets(xref->trailer, "Info") != NULL; for (i = 1; i < xref->len; i++) { if (xref->table[i].type == 0 || xref->table[i].type == 'f') continue; error = pdf_load_object(&dict, xref, i, 0); if (error) { fz_catch(error, "ignoring broken object (%d 0 R)", i); continue; } if (!hasroot) { obj = fz_dict_gets(dict, "Type"); if (fz_is_name(obj) && !strcmp(fz_to_name(obj), "Catalog")) { obj = fz_new_indirect(i, 0, xref); fz_dict_puts(xref->trailer, "Root", obj); fz_drop_obj(obj); } } if (!hasinfo) { if (fz_dict_gets(dict, "Creator") || fz_dict_gets(dict, "Producer")) { obj = fz_new_indirect(i, 0, xref); fz_dict_puts(xref->trailer, "Info", obj); fz_drop_obj(obj); } } fz_drop_obj(dict); } } error = pdf_read_ocg(xref); if (error) { pdf_free_xref(xref); return fz_rethrow(error, "Broken Optional Content"); } *xrefp = xref; return fz_okay; }
fz_error pdf_parse_ind_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap, int *onum, int *ogen, int *ostmofs) { fz_error error = fz_okay; fz_obj *obj = NULL; int num = 0, gen = 0, stm_ofs; int tok; int len; int a, b; error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TOK_INT) return fz_throw("expected object number (%d %d R)", num, gen); num = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TOK_INT) return fz_throw("expected generation number (%d %d R)", num, gen); gen = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok != PDF_TOK_OBJ) return fz_throw("expected 'obj' keyword (%d %d R)", num, gen); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); switch (tok) { case PDF_TOK_OPEN_ARRAY: error = pdf_parse_array(&obj, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); break; case PDF_TOK_OPEN_DICT: error = pdf_parse_dict(&obj, xref, file, buf, cap); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); break; case PDF_TOK_NAME: obj = fz_new_name(buf); break; case PDF_TOK_REAL: obj = fz_new_real(fz_atof(buf)); break; case PDF_TOK_STRING: obj = fz_new_string(buf, len); break; case PDF_TOK_TRUE: obj = fz_new_bool(1); break; case PDF_TOK_FALSE: obj = fz_new_bool(0); break; case PDF_TOK_NULL: obj = fz_new_null(); break; case PDF_TOK_INT: a = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) { obj = fz_new_int(a); goto skip; } if (tok == PDF_TOK_INT) { b = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); if (tok == PDF_TOK_R) { obj = fz_new_indirect(a, b, xref); break; } } return fz_throw("expected 'R' keyword (%d %d R)", num, gen); case PDF_TOK_ENDOBJ: obj = fz_new_null(); goto skip; default: return fz_throw("syntax error in object (%d %d R)", num, gen); } error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(obj); return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen); } skip: if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(file); while (c == ' ') c = fz_read_byte(file); if (c == '\r') { c = fz_peek_byte(file); if (c != '\n') fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen); else fz_read_byte(file); } stm_ofs = fz_tell(file); } else if (tok == PDF_TOK_ENDOBJ) { stm_ofs = 0; } else { fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); stm_ofs = 0; } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stm_ofs; *op = obj; return fz_okay; }
fz_error pdf_parse_dict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error = fz_okay; fz_obj *dict = NULL; fz_obj *key = NULL; fz_obj *val = NULL; int tok; int len; int a, b; dict = fz_new_dict(8); while (1) { error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } skip: if (tok == PDF_TOK_CLOSE_DICT) { *op = dict; return fz_okay; } /* for BI .. ID .. EI in content streams */ if (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")) { *op = dict; return fz_okay; } if (tok != PDF_TOK_NAME) { fz_drop_obj(dict); return fz_throw("invalid key in dict"); } key = fz_new_name(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(key); fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } switch (tok) { case PDF_TOK_OPEN_ARRAY: error = pdf_parse_array(&val, xref, file, buf, cap); if (error) { fz_drop_obj(key); fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } break; case PDF_TOK_OPEN_DICT: error = pdf_parse_dict(&val, xref, file, buf, cap); if (error) { fz_drop_obj(key); fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } break; case PDF_TOK_NAME: val = fz_new_name(buf); break; case PDF_TOK_REAL: val = fz_new_real(fz_atof(buf)); break; case PDF_TOK_STRING: val = fz_new_string(buf, len); break; case PDF_TOK_TRUE: val = fz_new_bool(1); break; case PDF_TOK_FALSE: val = fz_new_bool(0); break; case PDF_TOK_NULL: val = fz_new_null(); break; case PDF_TOK_INT: /* 64-bit to allow for numbers > INT_MAX and overflow */ a = (int) strtoll(buf, 0, 10); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(key); fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID"))) { val = fz_new_int(a); fz_dict_put(dict, key, val); fz_drop_obj(val); fz_drop_obj(key); goto skip; } if (tok == PDF_TOK_INT) { b = atoi(buf); error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(key); fz_drop_obj(dict); return fz_rethrow(error, "cannot parse dict"); } if (tok == PDF_TOK_R) { val = fz_new_indirect(a, b, xref); break; } } fz_drop_obj(key); fz_drop_obj(dict); return fz_throw("invalid indirect reference in dict"); default: fz_drop_obj(key); fz_drop_obj(dict); return fz_throw("unknown token in dict"); } fz_dict_put(dict, key, val); fz_drop_obj(val); fz_drop_obj(key); } }
fz_error pdf_parse_array(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap) { fz_error error = fz_okay; fz_obj *ary = NULL; fz_obj *obj = NULL; int a = 0, b = 0, n = 0; int tok; int len; ary = fz_new_array(4); while (1) { error = pdf_lex(&tok, file, buf, cap, &len); if (error) { fz_drop_obj(ary); return fz_rethrow(error, "cannot parse array"); } if (tok != PDF_TOK_INT && tok != PDF_TOK_R) { if (n > 0) { obj = fz_new_int(a); fz_array_push(ary, obj); fz_drop_obj(obj); } if (n > 1) { obj = fz_new_int(b); fz_array_push(ary, obj); fz_drop_obj(obj); } n = 0; } if (tok == PDF_TOK_INT && n == 2) { obj = fz_new_int(a); fz_array_push(ary, obj); fz_drop_obj(obj); a = b; n --; } switch (tok) { case PDF_TOK_CLOSE_ARRAY: *op = ary; return fz_okay; case PDF_TOK_INT: if (n == 0) a = atoi(buf); if (n == 1) b = atoi(buf); n ++; break; case PDF_TOK_R: if (n != 2) { fz_drop_obj(ary); return fz_throw("cannot parse indirect reference in array"); } obj = fz_new_indirect(a, b, xref); fz_array_push(ary, obj); fz_drop_obj(obj); n = 0; break; case PDF_TOK_OPEN_ARRAY: error = pdf_parse_array(&obj, xref, file, buf, cap); if (error) { fz_drop_obj(ary); return fz_rethrow(error, "cannot parse array"); } fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_OPEN_DICT: error = pdf_parse_dict(&obj, xref, file, buf, cap); if (error) { fz_drop_obj(ary); return fz_rethrow(error, "cannot parse array"); } fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_NAME: obj = fz_new_name(buf); fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_REAL: obj = fz_new_real(fz_atof(buf)); fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_STRING: obj = fz_new_string(buf, len); fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_TRUE: obj = fz_new_bool(1); fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_FALSE: obj = fz_new_bool(0); fz_array_push(ary, obj); fz_drop_obj(obj); break; case PDF_TOK_NULL: obj = fz_new_null(); fz_array_push(ary, obj); fz_drop_obj(obj); break; default: fz_drop_obj(ary); return fz_throw("cannot parse token in array"); } } }