Ejemplo n.º 1
0
/**
 * Turn the raw bytes of a PDF string into its final form.
 *
 * Processing happens in three stages, each operating on a private working
 * copy of the input:
 *   1. PDF backslash escapes (\n, \r, \t, \b, \f, \(, \), \\ and three-digit
 *      \ddd sequences) are replaced by the bytes they denote.
 *   2. If the ENCRYPTED_PDF bit is set in pdf->flags, the result is passed
 *      through pdf_decrypt_string().
 *   3. If the result contains a NUL byte or any byte above 0x7f, it is handed
 *      to pdf_convert_utf() and that function's result is returned instead.
 *
 * @param pdf  PDF context; pdf->flags is read to detect encryption.
 * @param obj  Object the string belongs to; forwarded to pdf_decrypt_string().
 * @param in   Raw string bytes (need not be NUL-terminated). May be NULL.
 * @param len  Number of bytes available at @p in.
 *
 * @return A heap buffer owned by the caller, or NULL when @p in is NULL, an
 *         allocation fails, decryption fails, or pdf_convert_utf() returns
 *         NULL. Buffers produced by this function itself are NUL-terminated;
 *         NOTE(review): the buffer returned by pdf_convert_utf() is passed
 *         through unchanged - confirm it is heap-allocated and terminated.
 */
char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len)
{
    char *wrkstr, *output = NULL;
    size_t wrklen = len, outlen;
    size_t i;
    unsigned int likelyutf = 0;

    if (!in)
        return NULL;

    /* get a working copy (zero-filled, so it is always NUL-terminated) */
    wrkstr = cli_calloc(len+1, sizeof(char));
    if (!wrkstr)
        return NULL;
    memcpy(wrkstr, in, len);

    /* convert PDF specific escape sequences, like octal sequences */
    /* TODO: replace the escape sequences directly in the wrkstr   */
    /* memchr, not strchr: the data may contain NUL bytes before a backslash */
    if (memchr(wrkstr, '\\', wrklen)) {
        /* unescaping never grows the string, so wrklen+1 bytes suffice */
        output = cli_calloc(wrklen+1, sizeof(char));
        if (!output) {
            free(wrkstr);
            return NULL;
        }

        outlen = 0;
        for (i = 0; i < wrklen; ++i) {
            if ((i+1 < wrklen) && wrkstr[i] == '\\') {
                /* ctype functions require an unsigned char value */
                if ((i+3 < wrklen) &&
                    isdigit((unsigned char)wrkstr[i+1]) &&
                    isdigit((unsigned char)wrkstr[i+2]) &&
                    isdigit((unsigned char)wrkstr[i+3])) {
                    /* octal sequence */
                    char octal[4], *check;
                    unsigned long value;

                    memcpy(octal, &wrkstr[i+1], 3);
                    octal[3] = '\0';

                    value = strtoul(octal, &check, 8);
                    /*
                     * Only emit a byte when all three digits were valid octal
                     * digits; sequences containing 8 or 9 are dropped.
                     * Values above 0377 keep their low eight bits.
                     */
                    if (check == &octal[3])
                        output[outlen++] = (char)(unsigned char)value;
                    i += 3; /* 4 with for loop [\ddd] */
                } else {
                    /* other sequences */
                    switch (wrkstr[i+1]) {
                    case 'n':
                        output[outlen++] = 0x0a;
                        break;
                    case 'r':
                        output[outlen++] = 0x0d;
                        break;
                    case 't':
                        output[outlen++] = 0x09;
                        break;
                    case 'b':
                        output[outlen++] = 0x08;
                        break;
                    case 'f':
                        output[outlen++] = 0x0c;
                        break;
                    case '(':
                        output[outlen++] = 0x28;
                        break;
                    case ')':
                        output[outlen++] = 0x29;
                        break;
                    case '\\':
                        output[outlen++] = 0x5c;
                        break;
                    default:
                        /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */
                        break;
                    }
                    i += 1; /* 2 with for loop [\c] */
                }
            } else {
                /* ordinary byte, or a lone backslash as the very last byte */
                output[outlen++] = wrkstr[i];
            }
        }

        /*
         * Adopt the unescaped buffer directly. Duplicating it with a string
         * copy would stop at the first NUL byte while wrklen still counted
         * outlen bytes, making every later stage read past the allocation.
         * The buffer is zero-filled and outlen <= its size - 1, so it is
         * NUL-terminated.
         */
        free(wrkstr);
        wrkstr = output;
        output = NULL;
        wrklen = outlen;
    }

    /* check for encryption and decrypt */
    if (pdf->flags & (1 << ENCRYPTED_PDF))
    {
        off_t tmpsz = (off_t)wrklen;
        output = pdf_decrypt_string(pdf, obj, wrkstr, &tmpsz);
        outlen = (size_t)tmpsz;
        free(wrkstr);
        if (output) {
            /* re-copy into a buffer with room for a terminating NUL */
            wrkstr = cli_calloc(outlen+1, sizeof(char));
            if (!wrkstr) {
                free(output);
                return NULL;
            }
            memcpy(wrkstr, output, outlen);
            free(output);
            wrklen = outlen;
        } else {
            return NULL;
        }
    }

    /* check for UTF-* and convert to UTF-8 */
    for (i = 0; i < wrklen; ++i) {
        if (((unsigned char)wrkstr[i] > (unsigned char)0x7f) || (wrkstr[i] == '\0')) {
            likelyutf = 1;
            break;
        }
    }

    if (likelyutf) {
        /* the conversion result (possibly NULL) replaces the working copy */
        output = pdf_convert_utf(wrkstr, wrklen);
        free(wrkstr);
        wrkstr = output;
    }

    return wrkstr;
}
Ejemplo n.º 2
0
/**
 * Extract a string value from the raw bytes of a PDF object.
 *
 * When @p str is non-NULL it is searched for inside the object and parsing
 * starts right after it; otherwise parsing starts at @p objstart. After
 * skipping whitespace, one of three forms is handled:
 *   - an indirect object reference (as recognised by is_object_reference()):
 *     the referenced object is extracted to a temporary file, read back and
 *     parsed; the temporary file is removed again;
 *   - a hex string "<...>": returned verbatim, including both delimiters;
 *   - a literal string "(...)": the bytes between the parentheses are
 *     returned, run through pdf_convert_utf() if they contain a NUL byte or
 *     a byte above 0x7f. Escape sequences are not decoded here.
 *
 * @param pdf       PDF context, forwarded to the object helpers.
 * @param obj       Object being parsed; a reference back to it is rejected.
 * @param objstart  Start of the object's bytes.
 * @param objsize   Number of bytes available at @p objstart.
 * @param str       Optional key to locate first (NULL to parse from the start).
 * @param endchar   Optional out-parameter. On success it receives: the '>' of
 *                  a hex string; the last byte before the closing ')' of a
 *                  literal string; or the end pointer as left by
 *                  is_object_reference() for a reference.
 *
 * @return A heap-allocated string owned by the caller, or NULL if nothing
 *         could be parsed or an error occurred.
 */
char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
{
    const char *q = objstart;
    char *p1, *p2;
    size_t len, checklen;
    char *res = NULL;
    int likelyutf = 0;
    uint32_t objid;
    size_t i;

    /*
     * Yes, all of this is required to find the start and end of a potentially UTF-* string
     *
     * First, find the key of the key/value pair we're looking for in this object.
     * Second, determine whether the value points to another object (NOTE: this is sketchy behavior)
     * Third, attempt to determine if we're ASCII or UTF-*
     * If we're ASCII, just copy the ASCII string into a new heap-allocated string and return that
     * Fourth, Attempt to decode from UTF-* to UTF-8
     */

    if (!objstart)
        return NULL;

    if (str) {
        checklen = strlen(str);

        /* the key plus at least three more bytes must fit in the object */
        if (objsize < checklen + 3)
            return NULL;

        for (p1 = (char *)q; (size_t)(p1 - q) < objsize - checklen; p1++)
            if (!strncmp(p1, str, checklen))
                break;

        if ((size_t)(p1 - q) == objsize - checklen)
            return NULL;

        p1 += checklen;
    } else {
        p1 = (char *)q;
    }

    /* ctype functions require an unsigned char value */
    while ((size_t)(p1 - q) < objsize && isspace((unsigned char)p1[0]))
        p1++;

    if ((size_t)(p1 - q) == objsize)
        return NULL;

    /*
     * If str is non-null:
     *     We should be at the start of the string, minus 1
     * Else:
     *     We should be at the start of the string
     */

    p2 = (char *)(q + objsize);
    if (is_object_reference(p1, &p2, &objid)) {
        struct pdf_obj *newobj;
        char *begin, *p3;
        STATBUF sb;
        uint32_t objflags;
        int fd;
        size_t objsize2;

        newobj = find_obj(pdf, obj, objid);
        if (!newobj || newobj == obj)
            return NULL;

        /*
         * If pdf_handlename hasn't been called for this object,
         * then parse the object prior to extracting it
         */
        if (!(newobj->statsflags & OBJ_FLAG_PDFNAME_DONE))
            pdf_parseobj(pdf, newobj);

        /* Extract the object. Force pdf_extract_obj() to dump this object. */
        objflags = newobj->flags;
        newobj->flags |= (1 << OBJ_FORCEDUMP);

        if (pdf_extract_obj(pdf, newobj, PDF_EXTRACT_OBJ_NONE) != CL_SUCCESS) {
            /* do not leave the temporary force-dump bit set on failure */
            newobj->flags = objflags;
            return NULL;
        }

        newobj->flags = objflags;

        if (!(newobj->path))
            return NULL;

        fd = open(newobj->path, O_RDONLY);
        if (fd == -1)
            goto ref_unlink;

        if (FSTAT(fd, &sb))
            goto ref_close;

        if (sb.st_size) {
            /* one extra zeroed byte keeps the buffer NUL-terminated */
            begin = calloc(1, sb.st_size+1);
            if (!(begin))
                goto ref_close;

            if (read(fd, begin, sb.st_size) != sb.st_size) {
                free(begin);
                goto ref_close;
            }

            /*
             * Skip leading whitespace. objsize2 always holds the number of
             * bytes remaining at p3, so it alone bounds the loop. If the file
             * is all whitespace, p3 ends on the terminating NUL.
             */
            p3 = begin;
            objsize2 = (size_t)sb.st_size;
            while (objsize2 > 0 && isspace((unsigned char)p3[0])) {
                p3++;
                objsize2--;
            }

            switch (*p3) {
                case '(':
                case '<':
                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL);
                    free(begin);
                    break;
                default:
                    /* compare as unsigned so bytes >= 0x80 are detected too */
                    for (i = 0; i < objsize2; i++) {
                        if ((unsigned char)p3[i] >= 0x7f) {
                            likelyutf = 1;
                            break;
                        }
                    }

                    res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL;

                    if (!(res)) {
                        /*
                         * Return the raw content. Shift it to the front of
                         * the buffer so the result starts at the first
                         * non-whitespace byte and keeps all objsize2 bytes.
                         */
                        memmove(begin, p3, objsize2);
                        begin[objsize2] = '\0';
                        res = begin;
                    } else {
                        free(begin);
                    }
                    break;
            }
        }

        /* only reached when the file was opened and read without error */
        if (endchar)
            *endchar = p2;

ref_close:
        close(fd);
ref_unlink:
        cli_unlink(newobj->path);
        free(newobj->path);
        newobj->path = NULL;

        return res;
    }

    if (*p1 == '<') {
        /* Hex string */

        p2 = p1+1;
        while ((size_t)(p2 - q) < objsize && *p2 != '>')
            p2++;

        if ((size_t)(p2 - q) == objsize) {
            return NULL;
        }

        /* room for "<...>" plus a terminating NUL (buffer is zero-filled) */
        res = cli_calloc(1, (p2 - p1) + 2);
        if (!(res))
            return NULL;

        strncpy(res, p1, (p2 - p1) + 1);
        if (endchar)
            *endchar = p2;

        return res;
    }

    /* We should be at the start of a string literal (...) here */
    if (*p1 != '(')
        return NULL;

    /* Make a best effort to find the end of the string and determine if UTF-* */
    p2 = ++p1;
    while (p2 < objstart + objsize) {
        int shouldbreak = 0;

        if (!likelyutf && (*((unsigned char *)p2) > (unsigned char)0x7f || *p2 == '\0'))
            likelyutf = 1;

        switch (*p2) {
            case '\\':
                /* skip the escaped character as well */
                p2++;
                break;
            case ')':
                shouldbreak = 1;
                break;
            default:
                break;
        }

        if (shouldbreak) {
            /* step back onto the last byte of the string contents */
            p2--;
            break;
        }

        p2++;
    }

    /*
     * No closing parenthesis found. A backslash in the final byte moves p2
     * one past the end, so ">=" is required rather than "==".
     */
    if (p2 >= objstart + objsize)
        return NULL;

    len = (size_t)(p2 - p1) + 1;

    if (likelyutf == 0) {
        /* We're not UTF-*, so just make a copy of the string and return that */
        res = cli_calloc(1, len+1);
        if (!(res))
            return NULL;

        memcpy(res, p1, len);
        res[len] = '\0';
        if (endchar)
            *endchar = p2;

        return res;
    }

    res = pdf_convert_utf(p1, len);

    if (res && endchar)
        *endchar = p2;

    return res;
}