static void renumberobjs(void) { pdf_xrefentry *oldxref; int newlen; int num; /* Apply renumber map to indirect references in all objects in xref */ renumberobj(xref->trailer); for (num = 0; num < xref->len; num++) { fz_obj *obj = xref->table[num].obj; if (fz_isindirect(obj)) { obj = fz_newindirect(renumbermap[fz_tonum(obj)], 0, xref); pdf_updateobject(xref, num, 0, obj); fz_dropobj(obj); } else { renumberobj(obj); } } /* Create new table for the reordered, compacted xref */ oldxref = xref->table; xref->table = fz_calloc(xref->len, sizeof(pdf_xrefentry)); xref->table[0] = oldxref[0]; /* Move used objects into the new compacted xref */ newlen = 0; for (num = 1; num < xref->len; num++) { if (uselist[num]) { if (newlen < renumbermap[num]) newlen = renumbermap[num]; xref->table[renumbermap[num]] = oldxref[num]; } else { if (oldxref[num].obj) fz_dropobj(oldxref[num].obj); } } fz_free(oldxref); /* Update the used objects count in compacted xref */ xref->len = newlen + 1; /* Update list of used objects to fit with compacted xref */ for (num = 1; num < xref->len; num++) uselist[num] = 1; }
/*
 * Recursively walk a dictionary or array and replace each indirect reference
 * found in it with a new reference whose number comes from renumbermap
 * (generation 0). Objects that are neither dict nor array are left alone.
 */
static void renumberobj(fz_obj *obj)
{
	int idx;

	if (fz_isdict(obj))
	{
		for (idx = 0; idx < fz_dictlen(obj); idx++)
		{
			fz_obj *name = fz_dictgetkey(obj, idx);
			fz_obj *item = fz_dictgetval(obj, idx);

			if (!fz_isindirect(item))
			{
				renumberobj(item);
				continue;
			}

			/* Store the renumbered reference under the same key */
			item = fz_newindirect(renumbermap[fz_tonum(item)], 0, xref);
			fz_dictput(obj, name, item);
			fz_dropobj(item);
		}
		return;
	}

	if (fz_isarray(obj))
	{
		for (idx = 0; idx < fz_arraylen(obj); idx++)
		{
			fz_obj *item = fz_arrayget(obj, idx);

			if (!fz_isindirect(item))
			{
				renumberobj(item);
				continue;
			}

			/* Store the renumbered reference at the same index */
			item = fz_newindirect(renumbermap[fz_tonum(item)], 0, xref);
			fz_arrayput(obj, idx, item);
			fz_dropobj(item);
		}
	}
}
static fz_error parseobj(fz_obj **obj, pdf_xref *xref, char **sp, struct vap *v) { fz_error error; char buf[32]; int num, gen, len; char *tmp; char *s = *sp; if (*s == '\0') return fz_throw("end of data"); skipwhite(&s); error = fz_okay; if (v != nil && *s == '%') { s ++; switch (*s) { case 'o': *obj = fz_keepobj(va_arg(v->ap, fz_obj*)); break; case 'b': error = fz_newbool(obj, va_arg(v->ap, int)); break; case 'i': error = fz_newint(obj, va_arg(v->ap, int)); break; case 'f': error = fz_newreal(obj, (float)va_arg(v->ap, double)); break; case 'n': error = fz_newname(obj, va_arg(v->ap, char*)); break; case 'r': num = va_arg(v->ap, int); gen = va_arg(v->ap, int); error = fz_newindirect(obj, num, gen, xref); break; case 's': tmp = va_arg(v->ap, char*); error = fz_newstring(obj, tmp, strlen(tmp)); break; case '#': tmp = va_arg(v->ap, char*); len = va_arg(v->ap, int); error = fz_newstring(obj, tmp, len); break; default: error = fz_throw("unknown format specifier in packobj: '%c'", *s); break; } if (error) error = fz_rethrow(error, "cannot create object for %% format"); s ++; }
static void retainpages(int argc, char **argv) { fz_error error; fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent; /* Load the old page tree */ error = pdf_loadpagetree(xref); if (error) die(fz_rethrow(error, "cannot load page tree")); /* Keep only pages/type entry to avoid references to unretained pages */ oldroot = fz_dictgets(xref->trailer, "Root"); pages = fz_dictgets(oldroot, "Pages"); root = fz_newdict(2); fz_dictputs(root, "Type", fz_dictgets(oldroot, "Type")); fz_dictputs(root, "Pages", fz_dictgets(oldroot, "Pages")); pdf_updateobject(xref, fz_tonum(oldroot), fz_togen(oldroot), root); fz_dropobj(root); /* Create a new kids array with only the pages we want to keep */ parent = fz_newindirect(fz_tonum(pages), fz_togen(pages), xref); kids = fz_newarray(1); /* Retain pages specified */ while (argc - fz_optind) { int page, spage, epage; char *spec, *dash; char *pagelist = argv[fz_optind]; spec = fz_strsep(&pagelist, ","); while (spec) { dash = strchr(spec, '-'); if (dash == spec) spage = epage = pdf_getpagecount(xref); else spage = epage = atoi(spec); if (dash) { if (strlen(dash) > 1) epage = atoi(dash + 1); else epage = pdf_getpagecount(xref); } if (spage > epage) page = spage, spage = epage, epage = page; if (spage < 1) spage = 1; if (epage > pdf_getpagecount(xref)) epage = pdf_getpagecount(xref); for (page = spage; page <= epage; page++) { fz_obj *pageobj = pdf_getpageobject(xref, page); fz_obj *pageref = pdf_getpageref(xref, page); fz_dictputs(pageobj, "Parent", parent); /* Store page object in new kids array */ fz_arraypush(kids, pageref); } spec = fz_strsep(&pagelist, ","); } fz_optind++; } fz_dropobj(parent); /* Update page count and kids array */ countobj = fz_newint(fz_arraylen(kids)); fz_dictputs(pages, "Count", countobj); fz_dropobj(countobj); fz_dictputs(pages, "Kids", kids); fz_dropobj(kids); }
//
// Copy the pages of inFile into outFile and save outFile.
//
// Every source page is passed to processPage(), which fills up to three
// bounding rectangles; each non-empty rectangle becomes one output page made
// from a deep copy of the source page dictionary with its own media box and
// rotation. The copies are moved to the destination by pdf_transplant(), then
// a page tree, an info dictionary, a catalog and a trailer are created in the
// destination before it is written with pdf_savexref().
//
// Returns 0 on success, otherwise the value of soPdfError() for the error.
//
// Reference handling: pdf_updateobject(), fz_arraypush() and fz_dictputs()
// are treated as taking their own reference, so the local reference is
// released only after its last use and on every exit path.
//
int
copyPdfFile(
    soPdfFile* inFile,
    soPdfFile* outFile
    )
{
    fz_error    *error;
    int         pageTreeNum, pageTreeGen;

    assert(inFile != NULL);
    assert(outFile != NULL);

    //
    // Process every page in the source file
    //
    {
        printf("\nProcessing input page : ");
        for (int pageNo = 0; pageNo < pdf_getpagecount(inFile->pageTree); pageNo++)
        {
            displayPageNumber(pageNo + 1, !pageNo);

            // Get the page object from the source
            fz_obj  *pageObj = pdf_getpageobject(inFile->pageTree, pageNo);

            //
            // Process the page. Each page can be split into up-to 3 pages
            //
            fz_rect    bbRect[3];
            error = processPage(inFile, pageNo, bbRect, 3);
            if (error)
                return soPdfError(error);

            for (int ctr = 0; ctr < 3; ctr++)
            {
                // Check if this was a blank page
                if (fz_isemptyrect(bbRect[ctr]))
                    break;

                //
                // Copy the source page dictionary entry. A copy of the page
                // dict is added to the SOURCE file under a new object number,
                // and a reference to that copy is queued for the destination.
                //
                // This convoluted procedure is used because the copy is done
                // by pdf_transplant, which accepts a source and destination
                // and deep copies whatever the queued objects reference.
                //

                //
                // Allocate an object id and generation id in source file.
                //
                // There is a bug in mupdf where the object allocation returns
                // 0 oid and 0 gid when the input pdf file has iref stream,
                // so as a workaround pdf_allocobject is retried up to 10 times.
                //
                int sNum, sGen, tries;

                for (tries = 0; tries < 10; tries++)
                {
                    error = pdf_allocobject(inFile->xref, &sNum, &sGen);
                    if (error)
                        return soPdfError(error);

                    // If sNum is non zero then the allocation was successful
                    if (sNum != 0)
                        break;

                    // NOTE(review): part of the original workaround; this
                    // stores the page object under the (0, 0) slot before
                    // retrying - confirm this is intended.
                    pdf_updateobject(inFile->xref, sNum, sGen, pageObj);
                }

                // If we didn't succeed even after 10 tries then this file
                // is not going to work.
                if (tries >= 10)
                    return soPdfError(fz_throw("cannot allocate object because of mupdf bug"));

                // make a deep copy of the original page dict
                fz_obj  *pageObj2;
                error = fz_deepcopydict(&pageObj2, pageObj);
                if (error)
                    return soPdfError(error);

                // update the source file with the duplicate page object
                pdf_updateobject(inFile->xref, sNum, sGen, pageObj2);

                // create an indirect reference to the page object
                fz_obj  *pageRef2;
                error = fz_newindirect(&pageRef2, sNum, sGen);
                if (error)
                {
                    fz_dropobj(pageObj2);
                    return soPdfError(error);
                }

                // delete the parent dictionary entry
                // Do we need to delete any other dictionary entry
                // like annot, tabs, metadata, etc
                fz_dictdels(pageObj2, "Parent");

                // Set the media box
                setPageMediaBox(inFile->xref, pageObj2, bbRect[ctr]);

                // Set the rotation based on input
                switch(p_mode)
                {
                    // no rotation if fit height
                case FitHeight:
                case Fit2xHeight:
                    break;

                    // rotate -90 deg if fit width
                case Fit2xWidth:
                case FitWidth:
                    setPageRotate(pageObj2, p_reverseLandscape ? 90 : -90);
                    break;

                case SmartFitHeight:
                case SmartFitWidth:
                default:
                    fz_dropobj(pageRef2);
                    fz_dropobj(pageObj2);
                    return soPdfError(fz_throw("Mode(%d) not yet implemented.", p_mode));
                }

                // Done editing the copy: only now give up our reference to it
                fz_dropobj(pageObj2);

                // push the indirect reference to the destination list for
                // copy by pdf_transplant, then release our own reference
                error = fz_arraypush(outFile->editobjs, pageRef2);
                fz_dropobj(pageRef2);
                if (error)
                    return soPdfError(error);
            }
        }
    }

    // flush the objects into destination from source
    {
        fz_obj      *results;
        int         outPages;

        printf("\nCopying output page : ");
        error = pdf_transplant(outFile->xref, inFile->xref, &results, outFile->editobjs);
        if (error)
            return soPdfError(error);

        outPages = fz_arraylen(results);
        for (int ctr = 0; ctr < outPages; ctr++)
        {
            displayPageNumber(ctr + 1, !ctr);

            // With reversed landscape the pages are listed back to front
            error = fz_arraypush(outFile->pagelist,
                fz_arrayget(results, p_reverseLandscape ? outPages - 1 - ctr : ctr));
            if (error)
            {
                fz_dropobj(results);
                return soPdfError(error);
            }
        }

        fz_dropobj(results);
    }

    // flush page tree

    // Create page tree and add back-links
    {
        fz_obj  *pageTreeObj;
        fz_obj  *pageTreeRef;

        // allocate a new object in out file for pageTree object
        error = pdf_allocobject(outFile->xref, &pageTreeNum, &pageTreeGen);
        if (error)
            return soPdfError(error);

        // Create a page tree object
        error = fz_packobj(&pageTreeObj, "<</Type/Pages/Count %i/Kids %o>>",
            fz_arraylen(outFile->pagelist), outFile->pagelist);
        if (error)
            return soPdfError(error);

        // Update the xref entry with the pageTree object
        pdf_updateobject(outFile->xref, pageTreeNum, pageTreeGen, pageTreeObj);
        fz_dropobj(pageTreeObj);

        // Create a reference to the pageTree object
        error = fz_newindirect(&pageTreeRef, pageTreeNum, pageTreeGen);
        if (error)
            return soPdfError(error);

        //
        // For every page in the output file, update the parent entry
        //
        for (int ctr = 0; ctr < fz_arraylen(outFile->pagelist); ctr++)
        {
            fz_obj  *pageObj;

            int num = fz_tonum(fz_arrayget(outFile->pagelist, ctr));
            int gen = fz_togen(fz_arrayget(outFile->pagelist, ctr));

            // Get the page object from xref
            error = pdf_loadobject(&pageObj, outFile->xref, num, gen);
            if (error)
            {
                fz_dropobj(pageTreeRef);
                return soPdfError(error);
            }

            // Update the parent entry in the page dictionary
            error = fz_dictputs(pageObj, "Parent", pageTreeRef);
            if (error)
            {
                fz_dropobj(pageObj);
                fz_dropobj(pageTreeRef);
                return soPdfError(error);
            }

            // Update the entry with the updated page object
            pdf_updateobject(outFile->xref, num, gen, pageObj);

            fz_dropobj(pageObj);
        }

        // Every page dictionary now holds its own reference to the tree
        fz_dropobj(pageTreeRef);
    }

    // Create catalog and root entries
    {
        fz_obj  *catObj, *infoObj;
        int     rootNum, rootGen;
        int     infoNum, infoGen;

        //
        // Copy the info catalog to the destination

        // alloc an object id and gen id in destination file
        error = pdf_allocobject(outFile->xref, &infoNum, &infoGen);
        if (error)
            return soPdfError(error);

        // make a deep copy of the source info dictionary
        error = fz_deepcopydict(&infoObj, inFile->xref->info);
        if (error)
            return soPdfError(error);

        // update the dest file with object
        pdf_updateobject(outFile->xref, infoNum, infoGen, infoObj);

        // NOTE(review): xref->info is assigned without taking a reference of
        // its own and relies on the xref table entry keeping infoObj alive -
        // confirm who owns xref->info when the xref is closed.
        outFile->xref->info = infoObj;

        fz_dropobj(infoObj);

        //
        // root/catalog object creation
        error = pdf_allocobject(outFile->xref, &rootNum, &rootGen);
        if (error)
            return soPdfError(error);

        error = fz_packobj(&catObj, "<</Type/Catalog /Pages %r>>", pageTreeNum, pageTreeGen);
        if (error)
            return soPdfError(error);

        pdf_updateobject(outFile->xref, rootNum, rootGen, catObj);
        fz_dropobj(catObj);

        // Create trailer
        error = fz_packobj(&outFile->xref->trailer, "<</Root %r /Info %r>>",
            rootNum, rootGen, infoNum, infoGen);
        if (error)
            return soPdfError(error);
    }

    // Update the info in the target file and save the xref
    printf("\nSaving.\n");
    error = setPageInfo(inFile, outFile);
    if (error)
        return soPdfError(error);

    error = pdf_savexref(outFile->xref, outFile->fileName, NULL);
    if (error)
        return soPdfError(error);

    // Report the errors that were collected but not treated as fatal
    if (g_errorCount != 0)
    {
        printf("\nFollowing issues encounted were ignored.\n\n");
        for (int ctr = g_errorCount - 1; ctr >= 0; ctr--)
            soPdfError(g_errorList[ctr]);
    }
    printf("\nSaved.\n");

    return 0;
}
/*
 * Parse one indirect object definition ("<num> <gen> obj <value> ...") from
 * file, using buf/cap as the lexer scratch buffer.
 *
 * On success *op receives the parsed value, and, when the pointers are
 * non-null, *onum / *ogen receive the object and generation numbers and
 * *ostmofs receives the file position right after the "stream" keyword's
 * line ending (0 when the object has no stream). A missing endobj/stream
 * keyword only produces a warning. Returns fz_okay or a thrown error.
 */
fz_error
pdf_parseindobj(fz_obj **op, pdf_xref *xref,
	fz_stream *file, char *buf, int cap,
	int *onum, int *ogen, int *ostmofs)
{
	fz_error error = fz_okay;
	fz_obj *body = nil;
	int num = 0, gen = 0;
	int stmofs;
	pdf_token_e tok;
	int len;
	int first, second;

	/* Object number */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TINT)
		return fz_throw("cannot parse indirect object (%d %d R)", num, gen);
	num = atoi(buf);

	/* Generation number */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TINT)
		return fz_throw("cannot parse indirect object (%d %d R)", num, gen);
	gen = atoi(buf);

	/* The "obj" keyword */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOBJ)
		return fz_throw("cannot parse indirect object (%d %d R)", num, gen);

	/* First token of the value */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);

	switch (tok)
	{
	case PDF_TNULL:
		body = fz_newnull();
		break;
	case PDF_TTRUE:
		body = fz_newbool(1);
		break;
	case PDF_TFALSE:
		body = fz_newbool(0);
		break;
	case PDF_TNAME:
		body = fz_newname(buf);
		break;
	case PDF_TREAL:
		body = fz_newreal(atof(buf));
		break;
	case PDF_TSTRING:
		body = fz_newstring(buf, len);
		break;

	case PDF_TOARRAY:
		error = pdf_parsearray(&body, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TODICT:
		error = pdf_parsedict(&body, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TINT:
		/* Either a plain integer or the start of an "a b R" reference */
		first = atoi(buf);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		if (tok == PDF_TSTREAM || tok == PDF_TENDOBJ)
		{
			/* The token that ends the value has already been read */
			body = fz_newint(first);
			goto skip;
		}
		if (tok != PDF_TINT)
			return fz_throw("cannot parse indirect object (%d %d R)", num, gen);

		second = atoi(buf);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		if (tok != PDF_TR)
			return fz_throw("cannot parse indirect object (%d %d R)", num, gen);

		body = fz_newindirect(first, second, xref);
		break;

	case PDF_TENDOBJ:
		/* An empty body is treated as null */
		body = fz_newnull();
		goto skip;

	default:
		return fz_throw("cannot parse indirect object (%d %d R)", num, gen);
	}

	/* Fetch the token following the value: endobj or stream is expected */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
	{
		fz_dropobj(body);
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	}

skip:
	if (tok == PDF_TSTREAM)
	{
		int c;

		/* Skip spaces after the keyword; c ends up as the first other byte */
		do
			c = fz_readbyte(file);
		while (c == ' ');

		/* After CR, consume a following LF or warn that it is missing */
		if (c == '\r')
		{
			if (fz_peekbyte(file) == '\n')
				fz_readbyte(file);
			else
				fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
		}
		stmofs = fz_tell(file);
	}
	else
	{
		stmofs = 0;
		if (tok != PDF_TENDOBJ)
			fz_warn("expected endobj or stream keyword (%d %d R)", num, gen);
	}

	if (onum)
		*onum = num;
	if (ogen)
		*ogen = gen;
	if (ostmofs)
		*ostmofs = stmofs;
	*op = body;
	return fz_okay;
}
/*
 * Parse the body of a dictionary (the opening "<<" has already been read)
 * from file, using buf/cap as the lexer scratch buffer.
 *
 * Parsing stops at ">>" or, for inline images in content streams, at the
 * "ID" keyword. On success the new dictionary is stored in *op and fz_okay
 * is returned. On failure every object created here (the dictionary and any
 * pending key) is released, *op is left untouched and an error is returned.
 */
fz_error
pdf_parsedict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *dict = nil;
	fz_obj *key = nil;
	fz_obj *val = nil;
	pdf_token_e tok;
	int len;
	int a, b;

	dict = fz_newdict(8);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_dropobj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

skip:
		if (tok == PDF_TCDICT)
		{
			*op = dict;
			return fz_okay;
		}

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))
		{
			*op = dict;
			return fz_okay;
		}

		if (tok != PDF_TNAME)
		{
			fz_dropobj(dict);
			return fz_throw("invalid key in dict");
		}

		key = fz_newname(buf);

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			/* the key was already created, so it has to be released too */
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

		switch (tok)
		{
		case PDF_TOARRAY:
			error = pdf_parsearray(&val, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TODICT:
			error = pdf_parsedict(&val, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TNAME: val = fz_newname(buf); break;
		case PDF_TREAL: val = fz_newreal(atof(buf)); break;
		case PDF_TSTRING: val = fz_newstring(buf, len); break;
		case PDF_TTRUE: val = fz_newbool(1); break;
		case PDF_TFALSE: val = fz_newbool(0); break;
		case PDF_TNULL: val = fz_newnull(); break;

		case PDF_TINT:
			/* 64-bit to allow for numbers > INT_MAX and overflow */
			a = (int) strtoll(buf, 0, 10);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
			{
				fz_dropobj(key);
				fz_dropobj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			if (tok == PDF_TCDICT || tok == PDF_TNAME ||
				(tok == PDF_TKEYWORD && !strcmp(buf, "ID")))
			{
				/*
				 * Plain integer value. The token just read already
				 * belongs to the next entry (or ends the dict), so
				 * re-enter the loop without lexing again.
				 */
				val = fz_newint(a);
				fz_dictput(dict, key, val);
				fz_dropobj(val);
				fz_dropobj(key);
				goto skip;
			}
			if (tok == PDF_TINT)
			{
				b = atoi(buf);
				error = pdf_lex(&tok, file, buf, cap, &len);
				if (error)
				{
					fz_dropobj(key);
					fz_dropobj(dict);
					return fz_rethrow(error, "cannot parse dict");
				}
				if (tok == PDF_TR)
				{
					val = fz_newindirect(a, b, xref);
					break;
				}
			}
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_throw("invalid indirect reference in dict");

		default:
			/* release the key and the partial dict before bailing out */
			fz_dropobj(key);
			fz_dropobj(dict);
			return fz_throw("unknown token in dict");
		}

		/* the dictionary keeps its own references to key and val */
		fz_dictput(dict, key, val);
		fz_dropobj(val);
		fz_dropobj(key);
	}
}
/*
 * Parse the body of an array (the opening "[" has already been read) from
 * file, using buf/cap as the lexer scratch buffer.
 *
 * Integers are held back (at most two at a time) because "a b R" forms an
 * indirect reference; they are appended as plain integers once it is clear
 * that no R keyword follows. On success the array is stored in *op; on
 * failure the partial array is released and an error is returned.
 */
fz_error
pdf_parsearray(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *ary = nil;
	fz_obj *item = nil;
	int first = 0, second = 0;	/* integers held back */
	int pending = 0;		/* how many integers are held back */
	pdf_token_e tok;
	int len;

	ary = fz_newarray(4);

	for (;;)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_dropobj(ary);
			return fz_rethrow(error, "cannot parse array");
		}

		if (tok == PDF_TINT)
		{
			/* A third integer: the oldest one cannot be part of a reference */
			if (pending == 2)
			{
				item = fz_newint(first);
				fz_arraypush(ary, item);
				fz_dropobj(item);
				first = second;
				pending--;
			}
		}
		else if (tok != PDF_TR)
		{
			/* Any other token: flush the held-back integers in order */
			if (pending > 0)
			{
				item = fz_newint(first);
				fz_arraypush(ary, item);
				fz_dropobj(item);
			}
			if (pending > 1)
			{
				item = fz_newint(second);
				fz_arraypush(ary, item);
				fz_dropobj(item);
			}
			pending = 0;
		}

		switch (tok)
		{
		case PDF_TCARRAY:
			*op = ary;
			return fz_okay;

		case PDF_TINT:
			/* Hold the integer back; nothing is appended yet */
			if (pending == 0)
				first = atoi(buf);
			else if (pending == 1)
				second = atoi(buf);
			pending++;
			continue;

		case PDF_TR:
			if (pending != 2)
			{
				fz_dropobj(ary);
				return fz_throw("cannot parse indirect reference in array");
			}
			item = fz_newindirect(first, second, xref);
			pending = 0;
			break;

		case PDF_TOARRAY:
			error = pdf_parsearray(&item, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(ary);
				return fz_rethrow(error, "cannot parse array");
			}
			break;

		case PDF_TODICT:
			error = pdf_parsedict(&item, xref, file, buf, cap);
			if (error)
			{
				fz_dropobj(ary);
				return fz_rethrow(error, "cannot parse array");
			}
			break;

		case PDF_TNAME:
			item = fz_newname(buf);
			break;
		case PDF_TREAL:
			item = fz_newreal(atof(buf));
			break;
		case PDF_TSTRING:
			item = fz_newstring(buf, len);
			break;
		case PDF_TTRUE:
			item = fz_newbool(1);
			break;
		case PDF_TFALSE:
			item = fz_newbool(0);
			break;
		case PDF_TNULL:
			item = fz_newnull();
			break;

		default:
			fz_dropobj(ary);
			return fz_throw("cannot parse token in array");
		}

		/* Every case that reaches this point produced exactly one element */
		fz_arraypush(ary, item);
		fz_dropobj(item);
	}
}
/*
 * Load the image stored as object (num, gen) and write it to
 * "img-NNNN.pnm" as a binary PPM (P6).
 *
 * A 1-bit image without colour components is expanded into a four-component
 * pixmap first; an image whose colourspace is not named "DeviceRGB" is
 * converted to pdf_devicergb. Any failure ends the program through die().
 * The obj parameter is not used.
 */
static void saveimage(fz_obj *obj, int num, int gen)
{
	pdf_image *image = nil;
	fz_pixmap *pix;
	fz_obj *ref;
	fz_error error;
	char name[1024];
	FILE *out;
	int bpc, w, h, n;
	int x, y;

	error = fz_newindirect(&ref, num, gen, xref);
	if (error)
		die(error);

	error = pdf_newstore(&xref->store);
	if (error)
		die(error);

	error = pdf_loadimage(&image, xref, ref);
	if (error)
		die(error);

	n = image->super.n;
	w = image->super.w;
	h = image->super.h;
	bpc = image->bpc;

	/* One extra component on top of the image's own n */
	error = fz_newpixmap(&pix, 0, 0, w, h, n + 1);
	if (error)
		die(error);

	error = image->super.loadtile(&image->super, pix);
	if (error)
		die(error);

	if (n == 0 && bpc == 1)
	{
		/* Spread the single sample over three channels, first channel 255 */
		fz_pixmap *expanded;

		error = fz_newpixmap(&expanded, pix->x, pix->y, pix->w, pix->h, pdf_devicergb->n + 1);
		if (error)
			die(error);

		for (y = 0; y < pix->h; y++)
		{
			for (x = 0; x < pix->w; x++)
			{
				int pixel = y * pix->w + x;
				fz_sample *dst = &expanded->samples[pixel * expanded->n];

				dst[0] = 255;
				dst[1] = pix->samples[pixel];
				dst[2] = pix->samples[pixel];
				dst[3] = pix->samples[pixel];
			}
		}

		fz_droppixmap(pix);
		pix = expanded;
	}

	if (image->super.cs && strcmp(image->super.cs->name, "DeviceRGB") != 0)
	{
		fz_pixmap *converted;

		error = fz_newpixmap(&converted, pix->x, pix->y, pix->w, pix->h, pdf_devicergb->n + 1);
		if (error)
			die(error);

		fz_convertpixmap(image->super.cs, pix, pdf_devicergb, converted);
		fz_droppixmap(pix);
		pix = converted;
	}

	sprintf(name, "img-%04d.pnm", num);

	out = fopen(name, "wb");
	if (out == NULL)
		die(fz_throw("Error creating image file"));

	fprintf(out, "P6\n%d %d\n%d\n", w, h, 255);

	/* Emit components 1..3 of every pixel; component 0 is skipped */
	for (y = 0; y < pix->h; y++)
	{
		for (x = 0; x < pix->w; x++)
		{
			fz_sample *px = &pix->samples[(y * pix->w + x) * (pdf_devicergb->n + 1)];
			unsigned char r = px[1];
			unsigned char g = px[2];
			unsigned char b = px[3];

			fprintf(out, "%c%c%c", r, g, b);
		}
	}

	if (fclose(out) < 0)
		die(fz_throw("Error closing image file"));

	fz_droppixmap(pix);

	pdf_dropstore(xref->store);
	xref->store = nil;

	fz_dropimage(&image->super);

	fz_dropobj(ref);
}
void editflushcatalog(void) { fz_error *error; int rootnum, rootgen; int listnum, listgen; fz_obj *listref; fz_obj *obj; int i; /* Create page tree and add back-links */ error = pdf_allocobject(editxref, &listnum, &listgen); if (error) die(error); error = fz_packobj(&obj, "<</Type/Pages/Count %i/Kids %o>>", fz_arraylen(editpagelist), editpagelist); if (error) die(error); pdf_updateobject(editxref, listnum, listgen, obj); fz_dropobj(obj); error = fz_newindirect(&listref, listnum, listgen); if (error) die(error); for (i = 0; i < fz_arraylen(editpagelist); i++) { int num = fz_tonum(fz_arrayget(editpagelist, i)); int gen = fz_togen(fz_arrayget(editpagelist, i)); error = pdf_loadobject(&obj, editxref, num, gen); if (error) die(error); error = fz_dictputs(obj, "Parent", listref); if (error) die(error); pdf_updateobject(editxref, num, gen, obj); fz_dropobj(obj); } /* Create catalog */ error = pdf_allocobject(editxref, &rootnum, &rootgen); if (error) die(error); error = fz_packobj(&obj, "<</Type/Catalog/Pages %r>>", listnum, listgen); if (error) die(error); pdf_updateobject(editxref, rootnum, rootgen, obj); fz_dropobj(obj); /* Create trailer */ error = fz_packobj(&editxref->trailer, "<</Root %r>>", rootnum, rootgen); if (error) die(error); }
/*
 * Parse one indirect object definition ("<oid> <gid> obj <value> ...") from
 * file, using buf/cap as the lexer scratch buffer.
 *
 * On success returns nil, stores the value in *op and, when the pointers are
 * non-null, stores the object id, generation id and stream offset (0 when
 * the object has no stream) in *ooid, *ogid and *ostmofs.
 * On failure returns the underlying error, or a "corrupt indirect object"
 * syntax error when the token sequence is wrong; *op is not written.
 */
fz_error *
pdf_parseindobj(fz_obj **op, fz_stream *file, char *buf, int cap,
	int *ooid, int *ogid, int *ostmofs)
{
	fz_error *error = nil;
	fz_obj *obj = nil;
	int oid = 0, gid = 0, stmofs;
	int tok, len;
	int a, b;

	tok = pdf_lex(file, buf, cap, &len);
	if (tok != PDF_TINT)
		goto cleanup;
	oid = atoi(buf);

	tok = pdf_lex(file, buf, cap, &len);
	if (tok != PDF_TINT)
		goto cleanup;
	gid = atoi(buf);

	tok = pdf_lex(file, buf, cap, &len);
	if (tok != PDF_TOBJ)
		goto cleanup;

	tok = pdf_lex(file, buf, cap, &len);
	switch (tok)
	{
	case PDF_TOARRAY:
		error = pdf_parsearray(&obj, file, buf, cap);
		/*
		 * A failed container parse releases the container it stored
		 * through &obj; forget the stale pointer so that cleanup
		 * does not drop it a second time.
		 */
		if (error)
			obj = nil;
		break;
	case PDF_TODICT:
		error = pdf_parsedict(&obj, file, buf, cap);
		if (error)
			obj = nil;	/* same reasoning as for arrays */
		break;
	case PDF_TNAME: error = fz_newname(&obj, buf); break;
	case PDF_TREAL: error = fz_newreal(&obj, atof(buf)); break;
	case PDF_TSTRING: error = fz_newstring(&obj, buf, len); break;
	case PDF_TTRUE: error = fz_newbool(&obj, 1); break;
	case PDF_TFALSE: error = fz_newbool(&obj, 0); break;
	case PDF_TNULL: error = fz_newnull(&obj); break;

	case PDF_TINT:
		/* Either a plain integer or the start of an "a b R" reference */
		a = atoi(buf);
		tok = pdf_lex(file, buf, cap, &len);
		if (tok == PDF_TSTREAM || tok == PDF_TENDOBJ)
		{
			error = fz_newint(&obj, a);
			if (error)
				goto cleanup;
			/* the token that ends the value is already in tok */
			goto skip;
		}
		if (tok == PDF_TINT)
		{
			b = atoi(buf);
			tok = pdf_lex(file, buf, cap, &len);
			if (tok == PDF_TR)
			{
				error = fz_newindirect(&obj, a, b);
				break;
			}
		}
		goto cleanup;

	default:
		goto cleanup;
	}

	if (error)
		goto cleanup;

	tok = pdf_lex(file, buf, cap, &len);

skip:
	if (tok == PDF_TSTREAM)
	{
		/* consume the line ending after the keyword; CR should be followed by LF */
		int c = fz_readbyte(file);
		if (c == '\r')
		{
			c = fz_peekbyte(file);
			if (c != '\n')
				fz_warn("syntaxerror: DOS format line ending after stream keyword (%d %d)\n", oid, gid);
			else
				c = fz_readbyte(file);
		}
		stmofs = fz_tell(file);
	}
	else if (tok == PDF_TENDOBJ)
		stmofs = 0;
	else
		goto cleanup;

	if (ooid) *ooid = oid;
	if (ogid) *ogid = gid;
	if (ostmofs) *ostmofs = stmofs;
	*op = obj;
	return nil;

cleanup:
	if (obj)
		fz_dropobj(obj);
	if (error)
		return error;
	return fz_throw("syntaxerror: corrupt indirect object (%d %d)", oid, gid);
}
/*
 * Parse the body of a dictionary (the opening "<<" has already been read)
 * from file, using buf/cap as the lexer scratch buffer.
 *
 * Parsing stops at ">>" or, for inline images in content streams, at the
 * "ID" keyword. On success returns nil with the dictionary in *op.
 * On failure the dictionary and any pending key/value are released, *op is
 * reset to nil so the caller is not left with a dangling pointer, and the
 * underlying error or a "corrupt dictionary" syntax error is returned.
 */
fz_error *
pdf_parsedict(fz_obj **op, fz_stream *file, char *buf, int cap)
{
	fz_error *error = nil;
	fz_obj *dict = nil;
	fz_obj *key = nil;
	fz_obj *val = nil;
	int tok, len;
	int a, b;

	error = fz_newdict(op, 8);
	if (error)
		return error;
	dict = *op;

	while (1)
	{
		tok = pdf_lex(file, buf, cap, &len);

skip:
		if (tok == PDF_TCDICT)
			return nil;

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))
			return nil;

		if (tok != PDF_TNAME)
			goto cleanup;

		error = fz_newname(&key, buf);
		if (error)
			goto cleanup;

		tok = pdf_lex(file, buf, cap, &len);

		switch (tok)
		{
		case PDF_TOARRAY:
			error = pdf_parsearray(&val, file, buf, cap);
			/*
			 * A failed container parse releases the container it
			 * stored through &val; forget the stale pointer so
			 * that cleanup does not drop it a second time.
			 */
			if (error)
				val = nil;
			break;
		case PDF_TODICT:
			error = pdf_parsedict(&val, file, buf, cap);
			if (error)
				val = nil;	/* same reasoning as for arrays */
			break;
		case PDF_TNAME: error = fz_newname(&val, buf); break;
		case PDF_TREAL: error = fz_newreal(&val, atof(buf)); break;
		case PDF_TSTRING: error = fz_newstring(&val, buf, len); break;
		case PDF_TTRUE: error = fz_newbool(&val, 1); break;
		case PDF_TFALSE: error = fz_newbool(&val, 0); break;
		case PDF_TNULL: error = fz_newnull(&val); break;

		case PDF_TINT:
			/* Either a plain integer or the start of an "a b R" reference */
			a = atoi(buf);
			tok = pdf_lex(file, buf, cap, &len);
			if (tok == PDF_TCDICT || tok == PDF_TNAME ||
				(tok == PDF_TKEYWORD && !strcmp(buf, "ID")))
			{
				/*
				 * Plain integer value. The token just read already
				 * belongs to the next entry (or ends the dict), so
				 * re-enter the loop without lexing again.
				 */
				error = fz_newint(&val, a);
				if (error)
					goto cleanup;
				error = fz_dictput(dict, key, val);
				if (error)
					goto cleanup;
				fz_dropobj(val);
				fz_dropobj(key);
				key = val = nil;
				goto skip;
			}
			if (tok == PDF_TINT)
			{
				b = atoi(buf);
				tok = pdf_lex(file, buf, cap, &len);
				if (tok == PDF_TR)
				{
					error = fz_newindirect(&val, a, b);
					break;
				}
			}
			goto cleanup;

		default:
			goto cleanup;
		}

		if (error)
			goto cleanup;

		/* the dictionary keeps its own references to key and val */
		error = fz_dictput(dict, key, val);
		if (error)
			goto cleanup;

		fz_dropobj(val);
		fz_dropobj(key);
		key = val = nil;
	}

cleanup:
	if (key) fz_dropobj(key);
	if (val) fz_dropobj(val);
	if (dict) fz_dropobj(dict);
	/* the dictionary is gone; do not leave the caller a dangling pointer */
	*op = nil;
	if (error)
		return error;
	return fz_throw("syntaxerror: corrupt dictionary");
}
/*
 * Parse the body of an array (the opening "[" has already been read) from
 * file, using buf/cap as the lexer scratch buffer.
 *
 * Integers are held back (at most two at a time) because "a b R" forms an
 * indirect reference; they are appended as plain integers once it is clear
 * that no R keyword follows. On success returns nil with the array in *op.
 * On failure the array and any pending element are released, *op is reset to
 * nil so the caller is not left with a dangling pointer, and the underlying
 * error or a "corrupt array" syntax error is returned.
 */
fz_error *
pdf_parsearray(fz_obj **op, fz_stream *file, char *buf, int cap)
{
	fz_error *error = nil;
	fz_obj *ary = nil;
	fz_obj *obj = nil;
	int a = 0, b = 0, n = 0;
	int tok, len;

	error = fz_newarray(op, 4);
	if (error)
		return error;
	ary = *op;

	while (1)
	{
		tok = pdf_lex(file, buf, cap, &len);

		/* Anything but an integer or R: flush the held-back integers */
		if (tok != PDF_TINT && tok != PDF_TR)
		{
			if (n > 0)
			{
				error = fz_newint(&obj, a);
				if (error)
					goto cleanup;
				error = fz_arraypush(ary, obj);
				if (error)
					goto cleanup;
				fz_dropobj(obj);
				obj = nil;
			}
			if (n > 1)
			{
				error = fz_newint(&obj, b);
				if (error)
					goto cleanup;
				error = fz_arraypush(ary, obj);
				if (error)
					goto cleanup;
				fz_dropobj(obj);
				obj = nil;
			}
			n = 0;
		}

		/* A third integer: the oldest one cannot be part of a reference */
		if (tok == PDF_TINT && n == 2)
		{
			error = fz_newint(&obj, a);
			if (error)
				goto cleanup;
			error = fz_arraypush(ary, obj);
			if (error)
				goto cleanup;
			fz_dropobj(obj);
			obj = nil;
			a = b;
			n --;
		}

		switch (tok)
		{
		case PDF_TCARRAY:
			return nil;

		case PDF_TINT:
			/* hold the integer back; obj stays nil so nothing is pushed */
			if (n == 0)
				a = atoi(buf);
			if (n == 1)
				b = atoi(buf);
			n ++;
			break;

		case PDF_TR:
			if (n != 2)
				goto cleanup;
			error = fz_newindirect(&obj, a, b);
			if (error)
				goto cleanup;
			n = 0;
			break;

		case PDF_TOARRAY:
			error = pdf_parsearray(&obj, file, buf, cap);
			/*
			 * A failed container parse releases the container it
			 * stored through &obj; forget the stale pointer so
			 * that cleanup does not drop it a second time.
			 */
			if (error)
				obj = nil;
			break;
		case PDF_TODICT:
			error = pdf_parsedict(&obj, file, buf, cap);
			if (error)
				obj = nil;	/* same reasoning as for arrays */
			break;
		case PDF_TNAME: error = fz_newname(&obj, buf); break;
		case PDF_TREAL: error = fz_newreal(&obj, atof(buf)); break;
		case PDF_TSTRING: error = fz_newstring(&obj, buf, len); break;
		case PDF_TTRUE: error = fz_newbool(&obj, 1); break;
		case PDF_TFALSE: error = fz_newbool(&obj, 0); break;
		case PDF_TNULL: error = fz_newnull(&obj); break;

		default:
			goto cleanup;
		}

		if (error)
			goto cleanup;

		if (obj)
		{
			/* the array keeps its own reference to the element */
			error = fz_arraypush(ary, obj);
			if (error)
				goto cleanup;
			fz_dropobj(obj);
		}

		obj = nil;
	}

cleanup:
	if (obj) fz_dropobj(obj);
	if (ary) fz_dropobj(ary);
	/* the array is gone; do not leave the caller a dangling pointer */
	*op = nil;
	if (error)
		return error;
	return fz_throw("syntaxerror: corrupt array");
}
/*
 * Rebuild the cross-reference table of a damaged file by scanning it from
 * the start for "<num> <gen> obj" sequences and trailer-like dictionaries.
 *
 * buf/bufsize is the scratch buffer used for reading and lexing. Every object
 * found is recorded with its offsets; Encrypt, ID, Root and Info entries seen
 * in dictionaries along the way are remembered (the last one seen wins).
 * Afterwards the xref table is resized and filled in, stream lengths reported
 * by fz_repairobj are written back into the objects, the free list is linked
 * and a new trailer is created.
 *
 * Returns fz_okay on success, otherwise an already rethrown error.
 */
fz_error
pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
{
	fz_error error;
	fz_obj *dict, *obj;
	fz_obj *length;

	fz_obj *encrypt = nil;
	fz_obj *id = nil;
	fz_obj *root = nil;
	fz_obj *info = nil;

	struct entry *list = nil;
	int listlen;
	int listcap;
	int maxnum = 0;

	int num = 0;
	int gen = 0;
	int tmpofs, numofs = 0, genofs = 0;
	int stmlen, stmofs = 0;
	int tok;
	int next;
	int i, n;

	pdf_logxref("repairxref %p\n", xref);

	fz_seek(xref->file, 0, 0);

	listlen = 0;
	listcap = 1024;
	list = fz_calloc(listcap, sizeof(struct entry));

	/*
	 * look for '%PDF' version marker within first kilobyte of file;
	 * never read more than the buffer can hold
	 */
	n = fz_read(xref->file, (unsigned char *)buf, bufsize < 1024 ? bufsize : 1024);
	if (n < 0)
	{
		error = fz_rethrow(n, "cannot read from file");
		goto cleanup;
	}

	fz_seek(xref->file, 0, 0);
	/* i + 4 <= n also covers a marker sitting in the last four bytes read */
	for (i = 0; i + 4 <= n; i++)
	{
		if (memcmp(buf + i, "%PDF", 4) == 0)
		{
			fz_seek(xref->file, i, 0);
			break;
		}
	}

	while (1)
	{
		tmpofs = fz_tell(xref->file);
		if (tmpofs < 0)
		{
			error = fz_throw("cannot tell in file");
			goto cleanup;
		}

		error = pdf_lex(&tok, xref->file, buf, bufsize, &n);
		if (error)
		{
			fz_catch(error, "ignoring the rest of the file");
			break;
		}

		/*
		 * Remember the last two integers and where they started, so
		 * that when "obj" shows up they are its number and generation.
		 */
		if (tok == PDF_TINT)
		{
			numofs = genofs;
			num = gen;
			genofs = tmpofs;
			gen = atoi(buf);
		}

		if (tok == PDF_TOBJ)
		{
			error = fz_repairobj(xref->file, buf, bufsize, &stmofs, &stmlen, &encrypt, &id);
			if (error)
			{
				error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen);
				goto cleanup;
			}

			if (num < 0)
			{
				/* num is used as an index into xref->table below */
				fz_warn("ignoring object with invalid object number (%d %d R)", num, gen);
			}
			else
			{
				pdf_logxref("found object: (%d %d R)\n", num, gen);

				if (listlen + 1 == listcap)
				{
					listcap = (listcap * 3) / 2;
					list = fz_realloc(list, listcap, sizeof(struct entry));
				}

				list[listlen].num = num;
				list[listlen].gen = gen;
				list[listlen].ofs = numofs;
				list[listlen].stmofs = stmofs;
				list[listlen].stmlen = stmlen;
				listlen ++;

				if (num > maxnum)
					maxnum = num;
			}
		}

		/* trailer dictionary */
		if (tok == PDF_TODICT)
		{
			error = pdf_parsedict(&dict, xref, xref->file, buf, bufsize);
			if (error)
			{
				error = fz_rethrow(error, "cannot parse object");
				goto cleanup;
			}

			obj = fz_dictgets(dict, "Encrypt");
			if (obj)
			{
				if (encrypt)
					fz_dropobj(encrypt);
				encrypt = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "ID");
			if (obj)
			{
				if (id)
					fz_dropobj(id);
				id = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "Root");
			if (obj)
			{
				if (root)
					fz_dropobj(root);
				root = fz_keepobj(obj);
			}

			obj = fz_dictgets(dict, "Info");
			if (obj)
			{
				if (info)
					fz_dropobj(info);
				info = fz_keepobj(obj);
			}

			fz_dropobj(dict);
		}

		/* step over one byte after a lexer error token */
		if (tok == PDF_TERROR)
			fz_readbyte(xref->file);

		if (tok == PDF_TEOF)
			break;
	}

	/* make xref reasonable */

	pdf_resizexref(xref, maxnum + 1);

	for (i = 0; i < listlen; i++)
	{
		xref->table[list[i].num].type = 'n';
		xref->table[list[i].num].ofs = list[i].ofs;
		xref->table[list[i].num].gen = list[i].gen;

		xref->table[list[i].num].stmofs = list[i].stmofs;

		/* corrected stream length */
		if (list[i].stmlen >= 0)
		{
			pdf_logxref("correct stream length %d %d = %d\n",
				list[i].num, list[i].gen, list[i].stmlen);

			error = pdf_loadobject(&dict, xref, list[i].num, list[i].gen);
			if (error)
			{
				error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen);
				goto cleanup;
			}

			length = fz_newint(list[i].stmlen);
			fz_dictputs(dict, "Length", length);
			fz_dropobj(length);

			fz_dropobj(dict);
		}
	}

	/* entry 0 is the head of the free list */
	xref->table[0].type = 'f';
	xref->table[0].ofs = 0;
	xref->table[0].gen = 65535;
	xref->table[0].stmofs = 0;
	xref->table[0].obj = nil;

	/* chain the free entries: each one's ofs names the next higher free entry */
	next = 0;
	for (i = xref->len - 1; i >= 0; i--)
	{
		if (xref->table[i].type == 'f')
		{
			xref->table[i].ofs = next;
			if (xref->table[i].gen < 65535)
				xref->table[i].gen ++;
			next = i;
		}
	}

	/* create a repaired trailer, Root will be added later */

	xref->trailer = fz_newdict(5);

	obj = fz_newint(maxnum + 1);
	fz_dictputs(xref->trailer, "Size", obj);
	fz_dropobj(obj);

	if (root)
	{
		fz_dictputs(xref->trailer, "Root", root);
		fz_dropobj(root);
	}
	if (info)
	{
		fz_dictputs(xref->trailer, "Info", info);
		fz_dropobj(info);
	}

	if (encrypt)
	{
		if (fz_isindirect(encrypt))
		{
			/* create new reference with non-nil xref pointer */
			obj = fz_newindirect(fz_tonum(encrypt), fz_togen(encrypt), xref);
			fz_dropobj(encrypt);
			encrypt = obj;
		}
		fz_dictputs(xref->trailer, "Encrypt", encrypt);
		fz_dropobj(encrypt);
	}

	if (id)
	{
		if (fz_isindirect(id))
		{
			/* create new reference with non-nil xref pointer */
			obj = fz_newindirect(fz_tonum(id), fz_togen(id), xref);
			fz_dropobj(id);
			id = obj;
		}
		fz_dictputs(xref->trailer, "ID", id);
		fz_dropobj(id);
	}

	fz_free(list);
	return fz_okay;

cleanup:
	if (encrypt) fz_dropobj(encrypt);
	if (id) fz_dropobj(id);
	if (root) fz_dropobj(root);
	if (info) fz_dropobj(info);
	fz_free(list);
	return error; /* already rethrown */
}