char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len) { char *wrkstr, *output = NULL; size_t wrklen = len, outlen; unsigned int i, likelyutf = 0; if (!in) return NULL; /* get a working copy */ wrkstr = cli_calloc(len+1, sizeof(char)); if (!wrkstr) return NULL; memcpy(wrkstr, in, len); //cli_errmsg("pdf_final: start(%d): %s\n", wrklen, wrkstr); /* convert PDF specific escape sequences, like octal sequences */ /* TODO: replace the escape sequences directly in the wrkstr */ if (strchr(wrkstr, '\\')) { output = cli_calloc(wrklen+1, sizeof(char)); if (!output) { free(wrkstr); return NULL; } outlen = 0; for (i = 0; i < wrklen; ++i) { if ((i+1 < wrklen) && wrkstr[i] == '\\') { if ((i+3 < wrklen) && (isdigit(wrkstr[i+1]) && isdigit(wrkstr[i+2]) && isdigit(wrkstr[i+3]))) { /* octal sequence */ char octal[4], *check; unsigned long value; memcpy(octal, &wrkstr[i+1], 3); octal[3] = '\0'; value = (char)strtoul(octal, &check, 8); /* check if all characters were converted */ if (check == &octal[3]) output[outlen++] = value; i += 3; /* 4 with for loop [\ddd] */ } else { /* other sequences */ switch(wrkstr[i+1]) { case 'n': output[outlen++] = 0x0a; break; case 'r': output[outlen++] = 0x0d; break; case 't': output[outlen++] = 0x09; break; case 'b': output[outlen++] = 0x08; break; case 'f': output[outlen++] = 0x0c; break; case '(': output[outlen++] = 0x28; break; case ')': output[outlen++] = 0x29; break; case '\\': output[outlen++] = 0x5c; break; default: /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */ break; } i += 1; /* 2 with for loop [\c] */ } } else { output[outlen++] = wrkstr[i]; } } free(wrkstr); wrkstr = cli_strdup(output); free(output); wrklen = outlen; } //cli_errmsg("pdf_final: escaped(%d): %s\n", wrklen, wrkstr); /* check for encryption and decrypt */ if (pdf->flags & (1 << ENCRYPTED_PDF)) { off_t tmpsz = (off_t)wrklen; output = pdf_decrypt_string(pdf, obj, wrkstr, &tmpsz); outlen = (size_t)tmpsz; free(wrkstr); if (output) { wrkstr = cli_calloc(outlen+1, sizeof(char)); if (!wrkstr) { free(output); return NULL; } memcpy(wrkstr, output, outlen); free(output); wrklen = outlen; } else { return NULL; } } //cli_errmsg("pdf_final: decrypt(%d): %s\n", wrklen, wrkstr); /* check for UTF-* and convert to UTF-8 */ for (i = 0; i < wrklen; ++i) { if (((unsigned char)wrkstr[i] > (unsigned char)0x7f) || (wrkstr[i] == '\0')) { likelyutf = 1; break; } } if (likelyutf) { output = pdf_convert_utf(wrkstr, wrklen); free(wrkstr); wrkstr = output; } //cli_errmsg("pdf_final: postutf(%d): %s\n", wrklen, wrkstr); return wrkstr; }
char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar) { const char *q = objstart; char *p1, *p2; size_t len, checklen; char *res; int likelyutf = 0; uint32_t objid; size_t i; /* * Yes, all of this is required to find the start and end of a potentially UTF-* string * * First, find the key of the key/value pair we're looking for in this object. * Second, determine whether the value points to another object (NOTE: this is sketchy behavior) * Third, attempt to determine if we're ASCII or UTF-* * If we're ASCII, just copy the ASCII string into a new heap-allocated string and return that * Fourth, Attempt to decode from UTF-* to UTF-8 */ res = NULL; if (str) { checklen = strlen(str); if (objsize < strlen(str) + 3) return NULL; for (p1=(char *)q; (size_t)(p1 - q) < objsize-checklen; p1++) if (!strncmp(p1, str, checklen)) break; if ((size_t)(p1 - q) == objsize - checklen) return NULL; p1 += checklen; } else { p1 = (char *)q; } while ((size_t)(p1 - q) < objsize && isspace(p1[0])) p1++; if ((size_t)(p1 - q) == objsize) return NULL; /* * If str is non-null: * We should be at the start of the string, minus 1 * Else: * We should be at the start of the string */ p2 = (char *)(q + objsize); if (is_object_reference(p1, &p2, &objid)) { struct pdf_obj *newobj; char *begin, *p3; STATBUF sb; uint32_t objflags; int fd; size_t objsize2; newobj = find_obj(pdf, obj, objid); if (!(newobj)) return NULL; if (newobj == obj) return NULL; /* * If pdf_handlename hasn't been called for this object, * then parse the object prior to extracting it */ if (!(newobj->statsflags & OBJ_FLAG_PDFNAME_DONE)) pdf_parseobj(pdf, newobj); /* Extract the object. Force pdf_extract_obj() to dump this object. */ objflags = newobj->flags; newobj->flags |= (1 << OBJ_FORCEDUMP); if (pdf_extract_obj(pdf, newobj, PDF_EXTRACT_OBJ_NONE) != CL_SUCCESS) return NULL; newobj->flags = objflags; if (!(newobj->path)) return NULL; fd = open(newobj->path, O_RDONLY); if (fd == -1) { cli_unlink(newobj->path); free(newobj->path); newobj->path = NULL; return NULL; } if (FSTAT(fd, &sb)) { close(fd); cli_unlink(newobj->path); free(newobj->path); newobj->path = NULL; return NULL; } if (sb.st_size) { begin = calloc(1, sb.st_size+1); if (!(begin)) { close(fd); cli_unlink(newobj->path); free(newobj->path); newobj->path = NULL; return NULL; } if (read(fd, begin, sb.st_size) != sb.st_size) { close(fd); cli_unlink(newobj->path); free(newobj->path); newobj->path = NULL; free(begin); return NULL; } p3 = begin; objsize2 = sb.st_size; while ((size_t)(p3 - begin) < objsize2 && isspace(p3[0])) { p3++; objsize2--; } switch (*p3) { case '(': case '<': res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL); free(begin); break; default: for (i=0; i < objsize2; i++) { if (p3[i] >= 0x7f) { likelyutf=1; break; } } res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL; if (!(res)) { res = begin; res[objsize2] = '\0'; } else { free(begin); } } } close(fd); cli_unlink(newobj->path); free(newobj->path); newobj->path = NULL; if (endchar) *endchar = p2; return res; } if (*p1 == '<') { /* Hex string */ p2 = p1+1; while ((size_t)(p2 - q) < objsize && *p2 != '>') p2++; if ((size_t)(p2 - q) == objsize) { return NULL; } res = cli_calloc(1, (p2 - p1) + 2); if (!(res)) return NULL; strncpy(res, p1, (p2 - p1) + 1); if (endchar) *endchar = p2; return res; } /* We should be at the start of a string literal (...) here */ if (*p1 != '(') return NULL; /* Make a best effort to find the end of the string and determine if UTF-* */ p2 = ++p1; while (p2 < objstart + objsize) { int shouldbreak=0; if (!likelyutf && (*((unsigned char *)p2) > (unsigned char)0x7f || *p2 == '\0')) likelyutf = 1; switch (*p2) { case '\\': p2++; break; case ')': shouldbreak=1; break; } if (shouldbreak) { p2--; break; } p2++; } if (p2 == objstart + objsize) return NULL; len = (size_t)(p2 - p1) + 1; if (likelyutf == 0) { /* We're not UTF-*, so just make a copy of the string and return that */ res = cli_calloc(1, len+1); if (!(res)) return NULL; memcpy(res, p1, len); res[len] = '\0'; if (endchar) *endchar = p2; return res; } res = pdf_convert_utf(p1, len); if (res && endchar) *endchar = p2; return res; }