/*
	Parse one object out of an object stream.

	A single token is read from 'file'. Array and dictionary openers are
	delegated to pdf_parse_array / pdf_parse_dict; scalar tokens are turned
	into the matching object using the values the lexer left in 'buf'.
	Any other token raises FZ_ERROR_GENERIC.
*/
pdf_obj *
pdf_parse_stm_obj(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
{
	fz_context *ctx = file->ctx;
	pdf_token tok = pdf_lex(file, buf);

	/* Containers first: they consume further tokens themselves. */
	if (tok == PDF_TOK_OPEN_ARRAY)
		return pdf_parse_array(doc, file, buf);
	if (tok == PDF_TOK_OPEN_DICT)
		return pdf_parse_dict(doc, file, buf);

	/* Scalars are fully described by the token just read. */
	if (tok == PDF_TOK_NAME)
		return pdf_new_name(doc, buf->scratch);
	if (tok == PDF_TOK_REAL)
		return pdf_new_real(doc, buf->f);
	if (tok == PDF_TOK_STRING)
		return pdf_new_string(doc, buf->scratch, buf->len);
	if (tok == PDF_TOK_TRUE)
		return pdf_new_bool(doc, 1);
	if (tok == PDF_TOK_FALSE)
		return pdf_new_bool(doc, 0);
	if (tok == PDF_TOK_NULL)
		return pdf_new_null(doc);
	if (tok == PDF_TOK_INT)
		return pdf_new_int(doc, buf->i);

	fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
	return NULL; /* not reached when fz_throw does not return; keeps compilers quiet */
}
/*
	Parse an inline image (BI ... ID ... EI) from a content stream.

	The image dictionary is parsed up to the ID keyword, the whitespace
	following ID is consumed, the image data is loaded, and the stream is
	then scanned forward to the terminating EI keyword.

	Returns the loaded image. Throws FZ_ERROR_GENERIC if no EI keyword
	followed by a delimiter is found; the image is dropped before any
	error is rethrown. The parsed dictionary is always dropped.
*/
static fz_image *
parse_inline_image(fz_context *ctx, pdf_csi *csi, fz_stream *stm)
{
	pdf_document *doc = csi->doc;
	pdf_obj *rdb = csi->rdb;
	pdf_obj *dict = NULL;
	fz_image *image = NULL;
	int c;
	int seen_ei;

	fz_var(dict);
	fz_var(image);

	fz_try(ctx)
	{
		dict = pdf_parse_dict(ctx, doc, stm, &doc->lexbuf.base);

		/* One whitespace byte follows ID; a CR LF pair counts as one. */
		c = fz_read_byte(ctx, stm);
		if (c == '\r' && fz_peek_byte(ctx, stm) == '\n')
			fz_read_byte(ctx, stm);

		image = pdf_load_inline_image(ctx, doc, rdb, dict, stm);

		/* Scan for "EI" followed by whitespace, EOF, '<' or '/'. */
		seen_ei = 0;
		c = fz_read_byte(ctx, stm);
		for (;;)
		{
			/* Advance to the next 'E' (or give up at end of data). */
			while (c != 'E' && c != EOF)
				c = fz_read_byte(ctx, stm);
			if (c == EOF)
				break;

			c = fz_read_byte(ctx, stm);
			if (c == 'I')
			{
				/* Only peek: the byte after EI belongs to the caller. */
				c = fz_peek_byte(ctx, stm);
				if (c <= 32 || c == ' ' || c == EOF || c == '<' || c == '/')
				{
					seen_ei = 1;
					break;
				}
			}
			if (c == EOF)
				break;
		}

		if (!seen_ei)
			fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error after inline image");
	}
	fz_always(ctx)
	{
		pdf_drop_obj(ctx, dict);
	}
	fz_catch(ctx)
	{
		fz_drop_image(ctx, image);
		fz_rethrow(ctx);
	}

	return image;
}
/*
	Parse one object out of an object stream (error-code revision).

	Reads a single token into 'buf' and stores the resulting object in
	*op. Arrays and dictionaries are parsed by their own routines; scalar
	tokens are converted from the text left in 'buf'.

	Returns fz_okay on success, otherwise an error produced by
	fz_rethrow / fz_throw; *op is not written by this function on the
	unknown-token path.
*/
fz_error
pdf_parse_stm_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error status;
	int tok;
	int len;

	status = pdf_lex(&tok, file, buf, cap, &len);
	if (status)
		return fz_rethrow(status, "cannot parse token in object stream");

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
	case PDF_TOK_OPEN_DICT:
		/* Containers share the same error handling. */
		if (tok == PDF_TOK_OPEN_ARRAY)
			status = pdf_parse_array(op, xref, file, buf, cap);
		else
			status = pdf_parse_dict(op, xref, file, buf, cap);
		if (status)
			return fz_rethrow(status, "cannot parse object stream");
		break;

	case PDF_TOK_NAME:
		*op = fz_new_name(buf);
		break;
	case PDF_TOK_REAL:
		*op = fz_new_real(fz_atof(buf));
		break;
	case PDF_TOK_STRING:
		*op = fz_new_string(buf, len);
		break;
	case PDF_TOK_TRUE:
		*op = fz_new_bool(1);
		break;
	case PDF_TOK_FALSE:
		*op = fz_new_bool(0);
		break;
	case PDF_TOK_NULL:
		*op = fz_new_null();
		break;
	case PDF_TOK_INT:
		*op = fz_new_int(atoi(buf));
		break;

	default:
		return fz_throw("unknown token in object stream");
	}

	return fz_okay;
}
/*
	Skip over an old-style (plain text) xref table and parse the trailer
	dictionary that follows it into xref->trailer.

	The table itself is not interpreted here: for every subsection header
	"<ofs> <len>" the file position is advanced by 20 bytes per entry.

	The subsection length comes straight from the file, so it is validated
	before being used in the seek computation: a negative length or one
	that would overflow 't + 20 * len' is rejected instead of producing a
	bogus (or undefined) seek target.

	Returns fz_okay on success, otherwise an error from fz_throw/fz_rethrow.
*/
static fz_error
pdf_read_old_trailer(pdf_xref *xref, char *buf, int cap)
{
	/* Largest int value, spelled without <limits.h> so no new include is needed. */
	const int int_max = (int)(~0u >> 1);
	fz_error error;
	int len;
	char *s;
	int n;
	int t;
	int tok;
	int c;

	fz_read_line(xref->file, buf, cap);
	if (strncmp(buf, "xref", 4) != 0)
		return fz_throw("cannot find xref marker");

	while (1)
	{
		/* Subsection headers start with a digit; anything else ends the table. */
		c = fz_peek_byte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		fz_read_line(xref->file, buf, cap);
		s = buf;
		fz_strsep(&s, " "); /* ignore ofs */
		if (!s)
			return fz_throw("invalid range marker in xref");
		len = atoi(fz_strsep(&s, " "));
		if (len < 0)
			return fz_throw("xref range marker must be positive");

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
			fz_seek(xref->file, -(2 + (int)strlen(s)), 1);

		t = fz_tell(xref->file);
		if (t < 0)
			return fz_throw("cannot tell in file");

		/* Reject lengths for which t + 20 * len would overflow a signed int. */
		if (len > (int_max - t) / 20)
			return fz_throw("xref has too many entries");

		fz_seek(xref->file, t + 20 * len, 0);
	}

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_TRAILER)
		return fz_throw("expected trailer marker");

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_OPEN_DICT)
		return fz_throw("expected trailer dictionary");

	error = pdf_parse_dict(&xref->trailer, xref, xref->file, buf, cap);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");

	return fz_okay;
}
/*
	Parse one object out of an object stream.

	Lexes a single token and builds the corresponding object. Arrays and
	dictionaries are parsed recursively by their own routines; every other
	recognised token maps directly onto a scalar object. An unrecognised
	token is reported through fz_throw.
*/
pdf_obj *
pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
	fz_context *ctx = file->ctx;
	pdf_obj *result = NULL;
	int tok = pdf_lex(file, buf);

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		result = pdf_parse_array(xref, file, buf);
		break;
	case PDF_TOK_OPEN_DICT:
		result = pdf_parse_dict(xref, file, buf);
		break;
	case PDF_TOK_NAME:
		result = fz_new_name(ctx, buf->scratch);
		break;
	case PDF_TOK_REAL:
		result = pdf_new_real(ctx, buf->f);
		break;
	case PDF_TOK_STRING:
		result = pdf_new_string(ctx, buf->scratch, buf->len);
		break;
	case PDF_TOK_TRUE:
		result = pdf_new_bool(ctx, 1);
		break;
	case PDF_TOK_FALSE:
		result = pdf_new_bool(ctx, 0);
		break;
	case PDF_TOK_NULL:
		result = pdf_new_null(ctx);
		break;
	case PDF_TOK_INT:
		result = pdf_new_int(ctx, buf->i);
		break;
	default:
		fz_throw(ctx, "unknown token in object stream");
	}

	/* NULL only if the default branch's fz_throw were to return. */
	return result;
}
/*
	Parse the body of a dictionary (the opening '<<' has already been
	consumed by the caller) and return it as a new dictionary object.

	Parsing stops at the closing '>>' token, or at the keyword "ID", which
	terminates the dictionary of an inline image in a content stream.

	A value that is a broken array is tolerated: the key is discarded with
	a warning and the lexer is advanced to a token from which parsing can
	resume. Any other failure drops the partially built dictionary and is
	reported as "cannot parse dict".
*/
pdf_obj *
pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
	pdf_obj *dict;
	pdf_obj *key = NULL;
	pdf_obj *val = NULL;
	int tok;
	int a, b;
	fz_context *ctx = file->ctx;

	dict = pdf_new_dict(ctx, 8);

	/* key and val are modified inside the try block and read in the catch block. */
	fz_var(key);
	fz_var(val);

	fz_try(ctx)
	{
		while (1)
		{
			tok = pdf_lex(file, buf);
	/* Re-entry point used when the next token has already been read as lookahead. */
	skip:
			if (tok == PDF_TOK_CLOSE_DICT)
				break;

			/* for BI .. ID .. EI in content streams */
			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
				break;

			if (tok != PDF_TOK_NAME)
				fz_throw(ctx, "invalid key in dict");

			key = fz_new_name(ctx, buf->scratch);

			tok = pdf_lex(file, buf);

			switch (tok)
			{
			case PDF_TOK_OPEN_ARRAY:
				/* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1643 */
				fz_try(ctx)
				{
					val = pdf_parse_array(xref, file, buf);
				}
				fz_catch(ctx)
				{
					/* Recovery: forget this key and skip tokens until one
					 * that closes or opens a container, or end of file. */
					fz_warn(ctx, "ignoring broken array for '%s'", pdf_to_name(key));
					pdf_drop_obj(key);
					val = key = NULL;
					do
						tok = pdf_lex(file, buf);
					while (tok != PDF_TOK_CLOSE_DICT && tok != PDF_TOK_CLOSE_ARRAY && tok != PDF_TOK_EOF && tok != PDF_TOK_OPEN_ARRAY && tok != PDF_TOK_OPEN_DICT);
					/* '>>' : let the normal end-of-dict handling see it. */
					if (tok == PDF_TOK_CLOSE_DICT)
						goto skip;
					/* ']' : the broken array ended; carry on with the next key. */
					if (tok == PDF_TOK_CLOSE_ARRAY)
						continue;
					fz_throw(ctx, "cannot make sense of broken array after all");
				}
				break;

			case PDF_TOK_OPEN_DICT:
				val = pdf_parse_dict(xref, file, buf);
				break;

			case PDF_TOK_NAME: val = fz_new_name(ctx, buf->scratch); break;
			case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
			case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
			case PDF_TOK_TRUE: val = pdf_new_bool(ctx, 1); break;
			case PDF_TOK_FALSE: val = pdf_new_bool(ctx, 0); break;
			case PDF_TOK_NULL: val = pdf_new_null(ctx); break;

			case PDF_TOK_INT:
				/* An integer is either a plain value or the first number of an
				 * indirect reference "a b R"; one or two tokens of lookahead
				 * decide which.
				 * Original note: 64-bit to allow for numbers > INT_MAX and overflow.
				 * NOTE(review): 'a' is declared as int in this function. */
				a = buf->i;
				tok = pdf_lex(file, buf);
				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
				{
					/* Plain integer: store it, then reprocess the lookahead token. */
					val = pdf_new_int(ctx, a);
					fz_dict_put(dict, key, val);
					pdf_drop_obj(val);
					val = NULL;
					pdf_drop_obj(key);
					key = NULL;
					goto skip;
				}
				if (tok == PDF_TOK_INT)
				{
					b = buf->i;
					tok = pdf_lex(file, buf);
					if (tok == PDF_TOK_R)
					{
						val = pdf_new_indirect(ctx, a, b, xref);
						break;
					}
				}
				fz_throw(ctx, "invalid indirect reference in dict");

			default:
				fz_throw(ctx, "unknown token in dict");
			}

			/* Store the pair and release the local references. */
			fz_dict_put(dict, key, val);
			pdf_drop_obj(val);
			val = NULL;
			pdf_drop_obj(key);
			key = NULL;
		}
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(dict);
		pdf_drop_obj(key);
		pdf_drop_obj(val);
		fz_throw(ctx, "cannot parse dict");
	}
	return dict;
}
/*
	Parse the body of an array (the opening '[' has already been consumed)
	and return it as a new array object.

	Integers cannot be pushed immediately because "a b R" forms an indirect
	reference. Up to two integers are therefore held back ('first' and
	'second', with 'pending' counting them) and are only emitted as plain
	integers once a token arrives that rules out a reference.

	On any failure the partially built array is dropped and the error is
	reported as "cannot parse array".
*/
pdf_obj *
pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
	fz_context *ctx = file->ctx;
	pdf_obj *ary = NULL;
	pdf_obj *obj = NULL;
	int first = 0, second = 0, pending = 0;
	int tok;

	fz_var(obj);

	ary = pdf_new_array(ctx, 4);

	fz_try(ctx)
	{
		for (;;)
		{
			tok = pdf_lex(file, buf);

			/* Anything other than INT or R means the held-back integers are plain values. */
			if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
			{
				if (pending > 0)
				{
					obj = pdf_new_int(ctx, first);
					pdf_array_push(ary, obj);
					pdf_drop_obj(obj);
					obj = NULL;
				}
				if (pending > 1)
				{
					obj = pdf_new_int(ctx, second);
					pdf_array_push(ary, obj);
					pdf_drop_obj(obj);
					obj = NULL;
				}
				pending = 0;
			}

			/* A third integer in a row: the oldest one can no longer start a reference. */
			if (tok == PDF_TOK_INT && pending == 2)
			{
				obj = pdf_new_int(ctx, first);
				pdf_array_push(ary, obj);
				pdf_drop_obj(obj);
				obj = NULL;
				first = second;
				pending--;
			}

			if (tok == PDF_TOK_CLOSE_ARRAY)
				break;

			/* Each case either produces 'obj' and breaks, or continues the loop. */
			switch (tok)
			{
			case PDF_TOK_INT:
				if (pending == 0)
					first = buf->i;
				if (pending == 1)
					second = buf->i;
				pending++;
				continue;

			case PDF_TOK_R:
				if (pending != 2)
					fz_throw(ctx, "cannot parse indirect reference in array");
				obj = pdf_new_indirect(ctx, first, second, xref);
				pending = 0;
				break;

			case PDF_TOK_OPEN_ARRAY:
				obj = pdf_parse_array(xref, file, buf);
				break;
			case PDF_TOK_OPEN_DICT:
				obj = pdf_parse_dict(xref, file, buf);
				break;
			case PDF_TOK_NAME:
				obj = fz_new_name(ctx, buf->scratch);
				break;
			case PDF_TOK_REAL:
				obj = pdf_new_real(ctx, buf->f);
				break;
			case PDF_TOK_STRING:
				obj = pdf_new_string(ctx, buf->scratch, buf->len);
				break;
			case PDF_TOK_TRUE:
				obj = pdf_new_bool(ctx, 1);
				break;
			case PDF_TOK_FALSE:
				obj = pdf_new_bool(ctx, 0);
				break;
			case PDF_TOK_NULL:
				obj = pdf_new_null(ctx);
				break;

			default:
				fz_throw(ctx, "cannot parse token in array");
			}

			/* Append the element and release the local reference. */
			pdf_array_push(ary, obj);
			pdf_drop_obj(obj);
			obj = NULL;
		}
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(obj);
		pdf_drop_obj(ary);
		fz_throw(ctx, "cannot parse array");
	}

	return ary;
}
struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar) { struct pdf_array *res=NULL; struct pdf_array_node *node=NULL; const char *objstart; char *end; int in_string=0, ninner=0; /* Sanity checking */ if (!(pdf) || !(obj) || !(begin)) return NULL; objstart = obj->start + pdf->map; if (begin < objstart || (size_t)(begin - objstart) >= objsz) return NULL; if (begin[0] != '[') return NULL; /* Find the end of the array */ end = begin; while ((size_t)(end - objstart) < objsz) { if (in_string) { if (*end == '\\') { end += 2; continue; } if (*end == ')') in_string = 0; end++; continue; } switch (*end) { case '(': in_string=1; break; case '[': ninner++; break; case ']': ninner--; break; } if (*end == ']' && ninner == 0) break; end++; } /* More sanity checking */ if ((size_t)(end - objstart) == objsz) return NULL; if (*end != ']') return NULL; res = cli_calloc(1, sizeof(struct pdf_array)); if (!(res)) return NULL; begin++; while (begin < end) { char *val=NULL, *p1; struct pdf_array *arr=NULL; struct pdf_dict *dict=NULL; while (begin < end && isspace(begin[0])) begin++; if (begin == end) break; switch (begin[0]) { case '<': if ((size_t)(begin - objstart) < objsz - 2 && begin[1] == '<') { dict = pdf_parse_dict(pdf, obj, objsz, begin, &begin); begin+=2; break; } /* Not a dictionary. Intentially fall through. 
*/ case '(': val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin, NULL); begin += 2; break; case '[': /* XXX We should have a recursion counter here */ arr = pdf_parse_array(pdf, obj, objsz, begin, &begin); begin+=1; break; default: p1 = end; if (!is_object_reference(begin, &p1, NULL)) { p1 = begin+1; while (p1 < end && !isspace(p1[0])) p1++; } val = cli_calloc((p1 - begin) + 2, 1); if (!(val)) break; strncpy(val, begin, p1 - begin); val[p1 - begin] = '\0'; begin = p1; break; } /* Parse error, just return what we could */ if (!(val) && !(arr) && !(dict)) break; if (!(node)) { res->nodes = res->tail = node = calloc(1, sizeof(struct pdf_array_node)); if (!(node)) { if (dict) pdf_free_dict(dict); if (val) free(val); if (arr) pdf_free_array(arr); break; } } else { node = calloc(1, sizeof(struct pdf_array_node)); if (!(node)) { if (dict) pdf_free_dict(dict); if (val) free(val); if (arr) pdf_free_array(arr); break; } node->prev = res->tail; if (res->tail) res->tail->next = node; res->tail = node; } if (val != NULL) { node->type = PDF_ARR_STRING; node->data = val; node->datasz = strlen(val); } else if (dict != NULL) { node->type = PDF_ARR_DICT; node->data = dict; node->datasz = sizeof(struct pdf_dict); } else { node->type = PDF_ARR_ARRAY; node->data = arr; node->datasz = sizeof(struct pdf_array); } } if (endchar) *endchar = end; return res; }
struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar) { struct pdf_dict *res=NULL; struct pdf_dict_node *node=NULL; const char *objstart; char *end; unsigned int in_string=0, ninner=0; /* Sanity checking */ if (!(pdf) || !(obj) || !(begin)) return NULL; objstart = (const char *)(obj->start + pdf->map); if (begin < objstart || (size_t)(begin - objstart) >= objsz - 2) return NULL; if (begin[0] != '<' || begin[1] != '<') return NULL; /* Find the end of the dictionary */ end = begin; while ((size_t)(end - objstart) < objsz) { int increment=1; if (in_string) { if (*end == '\\') { end += 2; continue; } if (*end == ')') in_string = 0; end++; continue; } switch (*end) { case '(': in_string=1; break; case '<': if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '<') ninner++; increment=2; break; case '>': if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '>') ninner--; increment=2; break; } if ((size_t)(end - objstart) <= objsz - 2) if (end[0] == '>' && end[1] == '>' && ninner == 0) break; end += increment; } /* More sanity checking */ if ((size_t)(end - objstart) >= objsz - 2) return NULL; if (end[0] != '>' || end[1] != '>') return NULL; res = cli_calloc(1, sizeof(struct pdf_dict)); if (!(res)) return NULL; /* Loop through each element of the dictionary */ begin += 2; while (begin < end) { char *val=NULL, *key=NULL, *p1, *p2; struct pdf_dict *dict=NULL; struct pdf_array *arr=NULL; unsigned int nhex=0, i; /* Skip any whitespaces */ while (begin < end && isspace(begin[0])) begin++; if (begin == end) break; /* Get the key */ p1 = begin+1; while (p1 < end && !isspace(p1[0])) { int breakout=0; switch (*p1) { case '<': case '[': case '(': case '/': case '\r': case '\n': case ' ': case '\t': breakout=1; break; case '#': /* Key name obfuscated with hex characters */ nhex++; if (p1 > end-3) { return res; } break; } if (breakout) break; p1++; } if (p1 == end) break; key = cli_calloc((p1 - begin) + 2, 1); if 
(!(key)) break; if (nhex == 0) { /* Key isn't obfuscated with hex. Just copy the string */ strncpy(key, begin, p1 - begin); key[p1 - begin] = '\0'; } else { for (i=0, p2 = begin; p2 < p1; p2++, i++) { if (*p2 == '#') { cli_hex2str_to(p2+1, key+i, 2); p2 += 2; } else { key[i] = *p2; } } } /* Now for the value */ begin = p1; /* Skip any whitespaces */ while (begin < end && isspace(begin[0])) begin++; if (begin == end) { free(key); break; } switch (begin[0]) { case '(': val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL); begin = p1+2; break; case '[': arr = pdf_parse_array(pdf, obj, objsz, begin, &p1); begin = p1+1; break; case '<': if ((size_t)(begin - objstart) < objsz - 2) { if (begin[1] == '<') { dict = pdf_parse_dict(pdf, obj, objsz, begin, &p1); begin = p1+2; break; } } val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL); begin = p1+2; break; default: p1 = (begin[0] == '/') ? begin+1 : begin; while (p1 < end) { int shouldbreak = 0; switch (p1[0]) { case '>': case '/': shouldbreak=1; break; } if (shouldbreak) break; p1++; } is_object_reference(begin, &p1, NULL); val = cli_calloc((p1 - begin) + 2, 1); if (!(val)) break; strncpy(val, begin, p1 - begin); val[p1 - begin] = '\0'; if (p1[0] != '/') begin = p1+1; else begin = p1; break; } if (!(val) && !(dict) && !(arr)) { free(key); break; } if (!(res->nodes)) { res->nodes = res->tail = node = cli_calloc(1, sizeof(struct pdf_dict_node)); if (!(node)) { free(key); if (dict) pdf_free_dict(dict); if (val) free(val); if (arr) pdf_free_array(arr); break; } } else { node = calloc(1, sizeof(struct pdf_dict_node)); if (!(node)) { free(key); if (dict) pdf_free_dict(dict); if (val) free(val); if (arr) pdf_free_array(arr); break; } node->prev = res->tail; if (res->tail) res->tail->next = node; res->tail = node; } node->key = key; if ((val)) { node->value = val; node->valuesz = strlen(val); node->type = PDF_DICT_STRING; } else if ((arr)) { node->value = arr; node->valuesz = sizeof(struct pdf_array); 
node->type = PDF_DICT_ARRAY; } else if ((dict)) { node->value = dict; node->valuesz = sizeof(struct pdf_dict); node->type = PDF_DICT_DICT; } } if (endchar) *endchar = end; return res; }
/*
	Parse the body of a dictionary (error-code revision) into *op.

	The opening '<<' has already been consumed. Parsing ends at '>>' or at
	the keyword "ID" (inline image dictionaries in content streams).

	An integer value needs lookahead, because "a b R" is an indirect
	reference. When the lookahead token turns out to belong to the next
	entry, 'have_tok' carries it into the next iteration instead of lexing
	again.

	Returns fz_okay with *op set on success. On failure the partially built
	dictionary (and any pending key) is dropped and an error is returned.
*/
fz_error
pdf_parse_dict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *dict = NULL;
	fz_obj *key = NULL;
	fz_obj *val = NULL;
	int tok;
	int len;
	int num, gen;
	int have_tok = 0;

	dict = fz_new_dict(8);

	for (;;)
	{
		if (!have_tok)
		{
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
				goto lex_failed;
		}
		have_tok = 0;

		/* End of dictionary, or ID in BI .. ID .. EI content stream images. */
		if (tok == PDF_TOK_CLOSE_DICT ||
			(tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
		{
			*op = dict;
			return fz_okay;
		}

		if (tok != PDF_TOK_NAME)
		{
			fz_drop_obj(dict);
			return fz_throw("invalid key in dict");
		}

		key = fz_new_name(buf);

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			goto lex_failed;

		switch (tok)
		{
		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&val, xref, file, buf, cap);
			if (error)
				goto lex_failed;
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&val, xref, file, buf, cap);
			if (error)
				goto lex_failed;
			break;

		case PDF_TOK_NAME:
			val = fz_new_name(buf);
			break;
		case PDF_TOK_REAL:
			val = fz_new_real(fz_atof(buf));
			break;
		case PDF_TOK_STRING:
			val = fz_new_string(buf, len);
			break;
		case PDF_TOK_TRUE:
			val = fz_new_bool(1);
			break;
		case PDF_TOK_FALSE:
			val = fz_new_bool(0);
			break;
		case PDF_TOK_NULL:
			val = fz_new_null();
			break;

		case PDF_TOK_INT:
			/* 64-bit to allow for numbers > INT_MAX and overflow */
			num = (int) strtoll(buf, 0, 10);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
				goto lex_failed;

			if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
				(tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
			{
				/* Plain integer; the token just read starts the next step. */
				val = fz_new_int(num);
				have_tok = 1;
				break;
			}

			if (tok == PDF_TOK_INT)
			{
				gen = atoi(buf);
				error = pdf_lex(&tok, file, buf, cap, &len);
				if (error)
					goto lex_failed;
				if (tok == PDF_TOK_R)
				{
					val = fz_new_indirect(num, gen, xref);
					break;
				}
			}

			fz_drop_obj(key);
			fz_drop_obj(dict);
			return fz_throw("invalid indirect reference in dict");

		default:
			fz_drop_obj(key);
			fz_drop_obj(dict);
			return fz_throw("unknown token in dict");
		}

		/* Store the pair and release the local references. */
		fz_dict_put(dict, key, val);
		fz_drop_obj(val);
		fz_drop_obj(key);
		key = NULL;
	}

lex_failed:
	/* key is NULL whenever no key is pending. */
	if (key)
		fz_drop_obj(key);
	fz_drop_obj(dict);
	return fz_rethrow(error, "cannot parse dict");
}
pdf_obj * pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf, int *onum, int *ogen, fz_off_t *ostmofs, int *try_repair) { pdf_obj *obj = NULL; int num = 0, gen = 0; fz_off_t stm_ofs; pdf_token tok; fz_off_t a, b; fz_var(obj); tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_INT) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number"); } num = buf->i; tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_INT) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num); } gen = buf->i; tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_OBJ) { if (try_repair) *try_repair = 1; fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen); } tok = pdf_lex(ctx, file, buf); switch (tok) { case PDF_TOK_OPEN_ARRAY: obj = pdf_parse_array(ctx, doc, file, buf); break; case PDF_TOK_OPEN_DICT: obj = pdf_parse_dict(ctx, doc, file, buf); break; case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break; case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break; case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break; case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break; case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break; case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break; case PDF_TOK_INT: a = buf->i; tok = pdf_lex(ctx, file, buf); if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) { obj = pdf_new_int_offset(ctx, doc, a); goto skip; } if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(ctx, file, buf); if (tok == PDF_TOK_R) { obj = pdf_new_indirect(ctx, doc, a, b); break; } } fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen); case PDF_TOK_ENDOBJ: obj = pdf_new_null(ctx, doc); goto skip; default: fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen); } fz_try(ctx) { tok = pdf_lex(ctx, file, buf); } fz_catch(ctx) { pdf_drop_obj(ctx, obj); 
fz_rethrow(ctx); } skip: if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(ctx, file); while (c == ' ') c = fz_read_byte(ctx, file); if (c == '\r') { c = fz_peek_byte(ctx, file); if (c != '\n') fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); else fz_read_byte(ctx, file); } stm_ofs = fz_tell(ctx, file); } else if (tok == PDF_TOK_ENDOBJ) { stm_ofs = 0; } else { fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); stm_ofs = 0; } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stm_ofs; return obj; }
/*
	Examine the object that follows an already-seen "<int> <int> obj"
	header while repairing a document.

	Visible behaviour: *stmofsp is reset to 0 and *stmlenp (if given) to
	-1. If the next token opens a dictionary, the dictionary is parsed and
	selected entries are copied out to the caller:
	  - for a dictionary whose Type is XRef: Encrypt, ID and Root;
	  - Length, when it is a direct integer, is remembered in stm_len;
	  - for a dictionary whose Type is Page, while the file is being read
	    linearly, the dictionary itself is stored in *page.

	NOTE(review): only the first part of this function is visible here;
	the code that consumes stm_len and produces the return value is not
	shown.
*/
int
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root)
{
	fz_stream *file = doc->file;
	pdf_token tok;
	int stm_len;

	*stmofsp = 0;
	if (stmlenp)
		*stmlenp = -1;

	stm_len = 0;

	/* On entry to this function, we know that we've just seen
	 * '<int> <int> obj'. We expect the next thing we see to be a
	 * pdf object. Regardless of the type of thing we meet next
	 * we only need to fully parse it if it is a dictionary. */
	tok = pdf_lex(ctx, file, buf);

	if (tok == PDF_TOK_OPEN_DICT)
	{
		pdf_obj *dict, *obj;

		fz_try(ctx)
		{
			dict = pdf_parse_dict(ctx, doc, file, buf);
		}
		fz_catch(ctx)
		{
			/* FZ_ERROR_TRYLATER is passed on to the caller unchanged. */
			fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
			/* Don't let a broken object at EOF overwrite a good one */
			if (file->eof)
				fz_rethrow(ctx);
			/* Silently swallow the error and continue with an empty dictionary */
			dict = pdf_new_dict(ctx, NULL, 2);
		}

		/* We must be careful not to try to resolve any indirections
		 * here. We have just read dict, so we know it to be a non
		 * indirected dictionary. Before we look at any values that
		 * we get back from looking up in it, we need to check they
		 * aren't indirected. */

		if (encrypt || id || root)
		{
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef))
			{
				if (encrypt)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
					if (obj)
					{
						/* Replace any previously recorded value. */
						pdf_drop_obj(ctx, *encrypt);
						*encrypt = pdf_keep_obj(ctx, obj);
					}
				}

				if (id)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
					if (obj)
					{
						/* Replace any previously recorded value. */
						pdf_drop_obj(ctx, *id);
						*id = pdf_keep_obj(ctx, obj);
					}
				}

				/* NOTE(review): unlike encrypt and id above, the previous
				 * *root is not dropped and a missing Root entry still
				 * overwrites it - confirm whether that is intended. */
				if (root)
					*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root));
			}
		}

		/* Only a direct integer Length is used. */
		obj = pdf_dict_get(ctx, dict, PDF_NAME_Length);
		if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
			stm_len = pdf_to_int(ctx, obj);

		if (doc->file_reading_linearly && page)
		{
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page))
			{
				pdf_drop_obj(ctx, *page);
				*page = pdf_keep_obj(ctx, dict);
			}
		}

		pdf_drop_obj(ctx, dict);
	}
/*
	Read an old-style (plain text) xref table into xref->table and parse
	the trailer dictionary that follows it into *trailerp.

	Each subsection starts with a "<ofs> <len>" header followed by 'len'
	entries of 20 bytes. Entries already present in the table (non-zero
	type) are left untouched.

	The header values come straight from the file and are used as array
	indices, so they are validated first: a header without a second field,
	a negative first object number, a negative count, or a range whose end
	would overflow an int is rejected.

	Returns fz_okay on success, otherwise an error from fz_throw/fz_rethrow.
*/
static fz_error
pdf_read_old_xref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
{
	/* Largest int value, spelled without <limits.h> so no new include is needed. */
	const int int_max = (int)(~0u >> 1);
	fz_error error;
	int ofs, len;
	char *s;
	int n;
	int tok;
	int i;
	int c;

	fz_read_line(xref->file, buf, cap);
	if (strncmp(buf, "xref", 4) != 0)
		return fz_throw("cannot find xref marker");

	while (1)
	{
		/* Subsection headers start with a digit; anything else ends the table. */
		c = fz_peek_byte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		fz_read_line(xref->file, buf, cap);
		s = buf;
		ofs = atoi(fz_strsep(&s, " "));
		/* No separator found: there is no length field to read. */
		if (!s)
			return fz_throw("invalid range marker in xref");
		len = atoi(fz_strsep(&s, " "));

		if (ofs < 0)
			return fz_throw("out of range object num in xref: %d", ofs);
		if (len < 0)
			return fz_throw("xref range marker must be positive");
		/* ofs + len is used as a table size and loop bound below. */
		if (len > int_max - ofs)
			return fz_throw("xref has too many entries");

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
		{
			fz_warn("broken xref section. proceeding anyway.");
			fz_seek(xref->file, -(2 + (int)strlen(s)), 1);
		}

		/* broken pdfs where size in trailer undershoots entries in xref sections */
		if (ofs + len > xref->len)
		{
			fz_warn("broken xref section, proceeding anyway.");
			pdf_resize_xref(xref, ofs + len);
		}

		for (i = ofs; i < ofs + len; i++)
		{
			n = fz_read(xref->file, (unsigned char *) buf, 20);
			if (n < 0)
				return fz_rethrow(n, "cannot read xref table");

			/* Keep entries that were already filled in. */
			if (!xref->table[i].type)
			{
				s = buf;

				/* broken pdfs where line start with white space */
				while (*s != '\0' && iswhite(*s))
					s++;

				/* Fixed-column entry: offset at 0, generation at 11, type at 17. */
				xref->table[i].ofs = atoi(s);
				xref->table[i].gen = atoi(s + 11);
				xref->table[i].type = s[17];
				if (s[17] != 'f' && s[17] != 'n' && s[17] != 'o')
					return fz_throw("unexpected xref type: %#x (%d %d R)", s[17], i, xref->table[i].gen);
			}
		}
	}

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_TRAILER)
		return fz_throw("expected trailer marker");

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_OPEN_DICT)
		return fz_throw("expected trailer dictionary");

	error = pdf_parse_dict(trailerp, xref, xref->file, buf, cap);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");

	return fz_okay;
}
/*
	Parse an indirect object definition (error-code revision):
	"<num> <gen> obj <value>" followed by 'endobj' or 'stream'.

	On success the value is stored in *op and fz_okay is returned. onum,
	ogen and ostmofs are optional outputs for the object number, the
	generation number, and the file offset where stream data begins (0 if
	the object has no stream). A missing 'endobj'/'stream' keyword only
	produces a warning.
*/
fz_error
pdf_parse_ind_obj(fz_obj **op, pdf_xref *xref,
	fz_stream *file, char *buf, int cap,
	int *onum, int *ogen, int *ostmofs)
{
	fz_error error = fz_okay;
	fz_obj *obj = NULL;
	int num = 0, gen = 0;
	int stm_ofs = 0;
	int tok;
	int len;
	int first, second;
	int have_next = 0; /* set when the token after the value is already in 'tok' */

	/* Header: "<num> <gen> obj" */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		goto parse_failed;
	if (tok != PDF_TOK_INT)
		return fz_throw("expected object number (%d %d R)", num, gen);
	num = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		goto parse_failed;
	if (tok != PDF_TOK_INT)
		return fz_throw("expected generation number (%d %d R)", num, gen);
	gen = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		goto parse_failed;
	if (tok != PDF_TOK_OBJ)
		return fz_throw("expected 'obj' keyword (%d %d R)", num, gen);

	/* Value */
	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		goto parse_failed;

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		error = pdf_parse_array(&obj, xref, file, buf, cap);
		if (error)
			goto parse_failed;
		break;

	case PDF_TOK_OPEN_DICT:
		error = pdf_parse_dict(&obj, xref, file, buf, cap);
		if (error)
			goto parse_failed;
		break;

	case PDF_TOK_NAME:
		obj = fz_new_name(buf);
		break;
	case PDF_TOK_REAL:
		obj = fz_new_real(fz_atof(buf));
		break;
	case PDF_TOK_STRING:
		obj = fz_new_string(buf, len);
		break;
	case PDF_TOK_TRUE:
		obj = fz_new_bool(1);
		break;
	case PDF_TOK_FALSE:
		obj = fz_new_bool(0);
		break;
	case PDF_TOK_NULL:
		obj = fz_new_null();
		break;

	case PDF_TOK_INT:
		/* Either a plain integer or the start of "a b R". */
		first = atoi(buf);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			goto parse_failed;
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
		{
			obj = fz_new_int(first);
			have_next = 1;
			break;
		}
		if (tok == PDF_TOK_INT)
		{
			second = atoi(buf);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
				goto parse_failed;
			if (tok == PDF_TOK_R)
			{
				obj = fz_new_indirect(first, second, xref);
				break;
			}
		}
		return fz_throw("expected 'R' keyword (%d %d R)", num, gen);

	case PDF_TOK_ENDOBJ:
		/* Empty object body: treated as null; 'endobj' is already in tok. */
		obj = fz_new_null();
		have_next = 1;
		break;

	default:
		return fz_throw("syntax error in object (%d %d R)", num, gen);
	}

	if (!have_next)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			/* The value is owned locally at this point. */
			fz_drop_obj(obj);
			goto parse_failed;
		}
	}

	if (tok == PDF_TOK_STREAM)
	{
		int c;

		/* Skip spaces after the keyword, then one end-of-line marker. */
		do
			c = fz_read_byte(file);
		while (c == ' ');

		if (c == '\r')
		{
			if (fz_peek_byte(file) == '\n')
				fz_read_byte(file);
			else
				fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
		}
		stm_ofs = fz_tell(file);
	}
	else if (tok != PDF_TOK_ENDOBJ)
	{
		fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
	}

	if (onum)
		*onum = num;
	if (ogen)
		*ogen = gen;
	if (ostmofs)
		*ostmofs = stm_ofs;
	*op = obj;
	return fz_okay;

parse_failed:
	/* num and gen hold whatever had been read when the failure occurred. */
	return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
}
/*
	Parse the body of a dictionary (the opening '<<' has already been
	consumed) and return it as a new dictionary object.

	Parsing ends at '>>' or at the keyword "ID", which terminates the
	dictionary of an inline image in a content stream.

	An integer value needs lookahead because "a b R" is an indirect
	reference. When the lookahead token belongs to the next entry, it is
	kept in 'tok' and the loop does not lex again.

	On failure the partially built dictionary and any pending key/value
	are dropped and the error is rethrown with "cannot parse dict".
*/
pdf_obj *
pdf_parse_dict(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
{
	fz_context *ctx = file->ctx;
	pdf_obj *dict;
	pdf_obj *key = NULL;
	pdf_obj *val = NULL;
	pdf_token tok;
	int num, gen;
	int lookahead;

	dict = pdf_new_dict(doc, 8);

	fz_var(key);
	fz_var(val);

	fz_try(ctx)
	{
		tok = pdf_lex(file, buf);

		for (;;)
		{
			if (tok == PDF_TOK_CLOSE_DICT)
				break;

			/* for BI .. ID .. EI in content streams */
			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
				break;

			if (tok != PDF_TOK_NAME)
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");

			key = pdf_new_name(doc, buf->scratch);

			lookahead = 0;
			tok = pdf_lex(file, buf);

			switch (tok)
			{
			case PDF_TOK_OPEN_ARRAY:
				val = pdf_parse_array(doc, file, buf);
				break;
			case PDF_TOK_OPEN_DICT:
				val = pdf_parse_dict(doc, file, buf);
				break;
			case PDF_TOK_NAME:
				val = pdf_new_name(doc, buf->scratch);
				break;
			case PDF_TOK_REAL:
				val = pdf_new_real(doc, buf->f);
				break;
			case PDF_TOK_STRING:
				val = pdf_new_string(doc, buf->scratch, buf->len);
				break;
			case PDF_TOK_TRUE:
				val = pdf_new_bool(doc, 1);
				break;
			case PDF_TOK_FALSE:
				val = pdf_new_bool(doc, 0);
				break;
			case PDF_TOK_NULL:
				val = pdf_new_null(doc);
				break;

			case PDF_TOK_INT:
				num = buf->i;
				tok = pdf_lex(file, buf);
				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
				{
					/* Plain integer; 'tok' already holds the next token. */
					val = pdf_new_int(doc, num);
					lookahead = 1;
					break;
				}
				if (tok == PDF_TOK_INT)
				{
					gen = buf->i;
					tok = pdf_lex(file, buf);
					if (tok == PDF_TOK_R)
					{
						val = pdf_new_indirect(doc, num, gen);
						break;
					}
				}
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");

			default:
				fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
			}

			/* Store the pair and release the local references. */
			pdf_dict_put(dict, key, val);
			pdf_drop_obj(val);
			val = NULL;
			pdf_drop_obj(key);
			key = NULL;

			if (!lookahead)
				tok = pdf_lex(file, buf);
		}
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(dict);
		pdf_drop_obj(key);
		pdf_drop_obj(val);
		fz_rethrow_message(ctx, "cannot parse dict");
	}

	return dict;
}
/*
	Parse an indirect object definition: "<num> <gen> obj <value>" followed
	by 'endobj' or 'stream', and return the value.

	onum, ogen and ostmofs are optional outputs for the object number, the
	generation number, and the file offset where stream data begins (0 if
	the object has no stream).

	Syntax errors are reported through fz_throw. A missing
	'endobj'/'stream' keyword only produces a warning.
*/
pdf_obj *
pdf_parse_ind_obj(pdf_document *xref,
	fz_stream *file, pdf_lexbuf *buf,
	int *onum, int *ogen, int *ostmofs)
{
	fz_context *ctx = file->ctx;
	pdf_obj *obj = NULL;
	int num = 0, gen = 0;
	int stm_ofs = 0;
	int tok;
	int first, second;
	int have_next = 0; /* set when the token after the value is already in 'tok' */

	fz_var(obj);

	/* Header: "<num> <gen> obj" */
	tok = pdf_lex(file, buf);
	if (tok != PDF_TOK_INT)
		fz_throw(ctx, "expected object number (%d %d R)", num, gen);
	num = buf->i;

	tok = pdf_lex(file, buf);
	if (tok != PDF_TOK_INT)
		fz_throw(ctx, "expected generation number (%d %d R)", num, gen);
	gen = buf->i;

	tok = pdf_lex(file, buf);
	if (tok != PDF_TOK_OBJ)
		fz_throw(ctx, "expected 'obj' keyword (%d %d R)", num, gen);

	/* Value */
	tok = pdf_lex(file, buf);
	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		obj = pdf_parse_array(xref, file, buf);
		break;
	case PDF_TOK_OPEN_DICT:
		obj = pdf_parse_dict(xref, file, buf);
		break;
	case PDF_TOK_NAME:
		obj = fz_new_name(ctx, buf->scratch);
		break;
	case PDF_TOK_REAL:
		obj = pdf_new_real(ctx, buf->f);
		break;
	case PDF_TOK_STRING:
		obj = pdf_new_string(ctx, buf->scratch, buf->len);
		break;
	case PDF_TOK_TRUE:
		obj = pdf_new_bool(ctx, 1);
		break;
	case PDF_TOK_FALSE:
		obj = pdf_new_bool(ctx, 0);
		break;
	case PDF_TOK_NULL:
		obj = pdf_new_null(ctx);
		break;

	case PDF_TOK_INT:
		/* Either a plain integer or the start of "a b R". */
		first = buf->i;
		tok = pdf_lex(file, buf);
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
		{
			obj = pdf_new_int(ctx, first);
			have_next = 1;
			break;
		}
		if (tok == PDF_TOK_INT)
		{
			second = buf->i;
			tok = pdf_lex(file, buf);
			if (tok == PDF_TOK_R)
			{
				obj = pdf_new_indirect(ctx, first, second, xref);
				break;
			}
		}
		fz_throw(ctx, "expected 'R' keyword (%d %d R)", num, gen);

	case PDF_TOK_ENDOBJ:
		/* Empty object body: treated as null; 'endobj' is already in tok. */
		obj = pdf_new_null(ctx);
		have_next = 1;
		break;

	default:
		fz_throw(ctx, "syntax error in object (%d %d R)", num, gen);
	}

	if (!have_next)
	{
		/* The value is owned locally now, so a lexer error must release it. */
		fz_try(ctx)
		{
			tok = pdf_lex(file, buf);
		}
		fz_catch(ctx)
		{
			pdf_drop_obj(obj);
			fz_throw(ctx, "cannot parse indirect object (%d %d R)", num, gen);
		}
	}

	if (tok == PDF_TOK_STREAM)
	{
		int c;

		/* Skip spaces after the keyword, then one end-of-line marker. */
		do
			c = fz_read_byte(file);
		while (c == ' ');

		if (c == '\r')
		{
			if (fz_peek_byte(file) == '\n')
				fz_read_byte(file);
			else
				fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
		}
		stm_ofs = fz_tell(file);
	}
	else if (tok != PDF_TOK_ENDOBJ)
	{
		fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
	}

	if (onum)
		*onum = num;
	if (ogen)
		*ogen = gen;
	if (ostmofs)
		*ostmofs = stm_ofs;

	return obj;
}
/*
	Parse the body of an array (error-code revision) into *op. The opening
	'[' has already been consumed.

	Integers cannot be pushed immediately because "a b R" forms an indirect
	reference. Up to two integers are held back ('first' and 'second', with
	'pending' counting them) and are emitted as plain integers once a token
	arrives that rules out a reference.

	Returns fz_okay with *op set on success. On failure the partially built
	array is dropped and an error is returned.
*/
fz_error
pdf_parse_array(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *ary = NULL;
	fz_obj *obj = NULL;
	int first = 0, second = 0, pending = 0;
	int tok;
	int len;

	ary = fz_new_array(4);

	for (;;)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			goto rethrow;

		/* Anything other than INT or R means the held-back integers are plain values. */
		if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
		{
			if (pending > 0)
			{
				obj = fz_new_int(first);
				fz_array_push(ary, obj);
				fz_drop_obj(obj);
			}
			if (pending > 1)
			{
				obj = fz_new_int(second);
				fz_array_push(ary, obj);
				fz_drop_obj(obj);
			}
			pending = 0;
		}

		/* A third integer in a row: the oldest one can no longer start a reference. */
		if (tok == PDF_TOK_INT && pending == 2)
		{
			obj = fz_new_int(first);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			first = second;
			pending--;
		}

		if (tok == PDF_TOK_CLOSE_ARRAY)
		{
			*op = ary;
			return fz_okay;
		}

		/* Each case either produces 'obj' and breaks, or continues the loop. */
		switch (tok)
		{
		case PDF_TOK_INT:
			if (pending == 0)
				first = atoi(buf);
			if (pending == 1)
				second = atoi(buf);
			pending++;
			continue;

		case PDF_TOK_R:
			if (pending != 2)
			{
				fz_drop_obj(ary);
				return fz_throw("cannot parse indirect reference in array");
			}
			obj = fz_new_indirect(first, second, xref);
			pending = 0;
			break;

		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&obj, xref, file, buf, cap);
			if (error)
				goto rethrow;
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&obj, xref, file, buf, cap);
			if (error)
				goto rethrow;
			break;

		case PDF_TOK_NAME:
			obj = fz_new_name(buf);
			break;
		case PDF_TOK_REAL:
			obj = fz_new_real(fz_atof(buf));
			break;
		case PDF_TOK_STRING:
			obj = fz_new_string(buf, len);
			break;
		case PDF_TOK_TRUE:
			obj = fz_new_bool(1);
			break;
		case PDF_TOK_FALSE:
			obj = fz_new_bool(0);
			break;
		case PDF_TOK_NULL:
			obj = fz_new_null();
			break;

		default:
			fz_drop_obj(ary);
			return fz_throw("cannot parse token in array");
		}

		/* Append the element and release the local reference. */
		fz_array_push(ary, obj);
		fz_drop_obj(obj);
	}

rethrow:
	/* Shared exit for lexer and nested-parser failures. */
	fz_drop_obj(ary);
	return fz_rethrow(error, "cannot parse array");
}