C++ (Cpp) pdf_parse_dictの例

コード例 #1

0

ファイルを表示

ファイル: pdf-parse.c プロジェクト: FabriceSalvaire/mupdf-v1.3

pdf_obj *
pdf_parse_stm_obj(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
{
	pdf_token tok;
	fz_context *ctx = file->ctx;

	tok = pdf_lex(file, buf);

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		return pdf_parse_array(doc, file, buf);
	case PDF_TOK_OPEN_DICT:
		return pdf_parse_dict(doc, file, buf);
	case PDF_TOK_NAME: return pdf_new_name(doc, buf->scratch); break;
	case PDF_TOK_REAL: return pdf_new_real(doc, buf->f); break;
	case PDF_TOK_STRING: return pdf_new_string(doc, buf->scratch, buf->len); break;
	case PDF_TOK_TRUE: return pdf_new_bool(doc, 1); break;
	case PDF_TOK_FALSE: return pdf_new_bool(doc, 0); break;
	case PDF_TOK_NULL: return pdf_new_null(doc); break;
	case PDF_TOK_INT: return pdf_new_int(doc, buf->i); break;
	default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
	}
	return NULL; /* Stupid MSVC */
}

コード例 #2

0

ファイルを表示

ファイル: pdf-interpret.c プロジェクト: ArtifexSoftware/mupdf

static fz_image *
parse_inline_image(fz_context *ctx, pdf_csi *csi, fz_stream *stm)
{
	pdf_document *doc = csi->doc;
	pdf_obj *rdb = csi->rdb;
	pdf_obj *obj = NULL;
	fz_image *img = NULL;
	int ch, found;

	fz_var(obj);
	fz_var(img);

	fz_try(ctx)
	{
		obj = pdf_parse_dict(ctx, doc, stm, &doc->lexbuf.base);

		/* read whitespace after ID keyword */
		ch = fz_read_byte(ctx, stm);
		if (ch == '\r')
			if (fz_peek_byte(ctx, stm) == '\n')
				fz_read_byte(ctx, stm);

		img = pdf_load_inline_image(ctx, doc, rdb, obj, stm);

		/* find EI */
		found = 0;
		ch = fz_read_byte(ctx, stm);
		do
		{
			while (ch != 'E' && ch != EOF)
				ch = fz_read_byte(ctx, stm);
			if (ch == 'E')
			{
				ch = fz_read_byte(ctx, stm);
				if (ch == 'I')
				{
					ch = fz_peek_byte(ctx, stm);
					if (ch == ' ' || ch <= 32 || ch == EOF || ch == '<' || ch == '/')
					{
						found = 1;
						break;
					}
				}
			}
		} while (ch != EOF);
		if (!found)
			fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error after inline image");
	}
	fz_always(ctx)
	{
		pdf_drop_obj(ctx, obj);
	}
	fz_catch(ctx)
	{
		fz_drop_image(ctx, img);
		fz_rethrow(ctx);
	}

	return img;
}

コード例 #3

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: jason5001001/dataextractor

fz_error
pdf_parse_stm_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error;
	int tok;
	int len;

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse token in object stream");

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		error = pdf_parse_array(op, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse object stream");
		break;
	case PDF_TOK_OPEN_DICT:
		error = pdf_parse_dict(op, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse object stream");
		break;
	case PDF_TOK_NAME: *op = fz_new_name(buf); break;
	case PDF_TOK_REAL: *op = fz_new_real(fz_atof(buf)); break;
	case PDF_TOK_STRING: *op = fz_new_string(buf, len); break;
	case PDF_TOK_TRUE: *op = fz_new_bool(1); break;
	case PDF_TOK_FALSE: *op = fz_new_bool(0); break;
	case PDF_TOK_NULL: *op = fz_new_null(); break;
	case PDF_TOK_INT: *op = fz_new_int(atoi(buf)); break;
	default: return fz_throw("unknown token in object stream");
	}

	return fz_okay;
}

コード例 #4

0

ファイルを表示

ファイル: pdf_xref.c プロジェクト: Shelnutt2/AJournal

static fz_error
pdf_read_old_trailer(pdf_xref *xref, char *buf, int cap)
{
	fz_error error;
	int len;
	char *s;
	int n;
	int t;
	int tok;
	int c;

	fz_read_line(xref->file, buf, cap);
	if (strncmp(buf, "xref", 4) != 0)
		return fz_throw("cannot find xref marker");

	while (1)
	{
		c = fz_peek_byte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		fz_read_line(xref->file, buf, cap);
		s = buf;
		fz_strsep(&s, " "); /* ignore ofs */
		if (!s)
			return fz_throw("invalid range marker in xref");
		len = atoi(fz_strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
			fz_seek(xref->file, -(2 + (int)strlen(s)), 1);

		t = fz_tell(xref->file);
		if (t < 0)
			return fz_throw("cannot tell in file");

		fz_seek(xref->file, t + 20 * len, 0);
	}

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_TRAILER)
		return fz_throw("expected trailer marker");

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_OPEN_DICT)
		return fz_throw("expected trailer dictionary");

	error = pdf_parse_dict(&xref->trailer, xref, xref->file, buf, cap);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	return fz_okay;
}

コード例 #5

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: kzmkv/SumatraPDF

pdf_obj *
pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
    int tok;
    fz_context *ctx = file->ctx;

    tok = pdf_lex(file, buf);
    /* RJW: "cannot parse token in object stream") */

    switch (tok)
    {
    case PDF_TOK_OPEN_ARRAY:
        return pdf_parse_array(xref, file, buf);
    /* RJW: "cannot parse object stream" */
    case PDF_TOK_OPEN_DICT:
        return pdf_parse_dict(xref, file, buf);
    /* RJW: "cannot parse object stream" */
    case PDF_TOK_NAME:
        return fz_new_name(ctx, buf->scratch);
        break;
    case PDF_TOK_REAL:
        return pdf_new_real(ctx, buf->f);
        break;
    case PDF_TOK_STRING:
        return pdf_new_string(ctx, buf->scratch, buf->len);
        break;
    case PDF_TOK_TRUE:
        return pdf_new_bool(ctx, 1);
        break;
    case PDF_TOK_FALSE:
        return pdf_new_bool(ctx, 0);
        break;
    case PDF_TOK_NULL:
        return pdf_new_null(ctx);
        break;
    case PDF_TOK_INT:
        return pdf_new_int(ctx, buf->i);
        break;
    default:
        fz_throw(ctx, "unknown token in object stream");
    }
    return NULL; /* Stupid MSVC */
}

コード例 #6

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: kzmkv/SumatraPDF

pdf_obj *
pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
    pdf_obj *dict;
    pdf_obj *key = NULL;
    pdf_obj *val = NULL;
    int tok;
    int a, b;
    fz_context *ctx = file->ctx;

    dict = pdf_new_dict(ctx, 8);

    fz_var(key);
    fz_var(val);

    fz_try(ctx)
    {
        while (1)
        {
            tok = pdf_lex(file, buf);
skip:
            if (tok == PDF_TOK_CLOSE_DICT)
                break;

            /* for BI .. ID .. EI in content streams */
            if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
                break;

            if (tok != PDF_TOK_NAME)
                fz_throw(ctx, "invalid key in dict");

            key = fz_new_name(ctx, buf->scratch);

            tok = pdf_lex(file, buf);

            switch (tok)
            {
            case PDF_TOK_OPEN_ARRAY:
                /* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1643 */
                fz_try(ctx)
                {
                    val = pdf_parse_array(xref, file, buf);
                }
                fz_catch(ctx)
                {
                    fz_warn(ctx, "ignoring broken array for '%s'", pdf_to_name(key));
                    pdf_drop_obj(key);
                    val = key = NULL;
                    do
                        tok = pdf_lex(file, buf);
                    while (tok != PDF_TOK_CLOSE_DICT && tok != PDF_TOK_CLOSE_ARRAY &&
                            tok != PDF_TOK_EOF && tok != PDF_TOK_OPEN_ARRAY && tok != PDF_TOK_OPEN_DICT);
                    if (tok == PDF_TOK_CLOSE_DICT)
                        goto skip;
                    if (tok == PDF_TOK_CLOSE_ARRAY)
                        continue;
                    fz_throw(ctx, "cannot make sense of broken array after all");
                }
                break;

            case PDF_TOK_OPEN_DICT:
                val = pdf_parse_dict(xref, file, buf);
                break;

            case PDF_TOK_NAME:
                val = fz_new_name(ctx, buf->scratch);
                break;
            case PDF_TOK_REAL:
                val = pdf_new_real(ctx, buf->f);
                break;
            case PDF_TOK_STRING:
                val = pdf_new_string(ctx, buf->scratch, buf->len);
                break;
            case PDF_TOK_TRUE:
                val = pdf_new_bool(ctx, 1);
                break;
            case PDF_TOK_FALSE:
                val = pdf_new_bool(ctx, 0);
                break;
            case PDF_TOK_NULL:
                val = pdf_new_null(ctx);
                break;

            case PDF_TOK_INT:
                /* 64-bit to allow for numbers > INT_MAX and overflow */
                a = buf->i;
                tok = pdf_lex(file, buf);
                if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
                        (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
                {
                    val = pdf_new_int(ctx, a);
                    fz_dict_put(dict, key, val);
                    pdf_drop_obj(val);
                    val = NULL;
                    pdf_drop_obj(key);
                    key = NULL;
                    goto skip;
                }
                if (tok == PDF_TOK_INT)
                {
                    b = buf->i;
                    tok = pdf_lex(file, buf);
                    if (tok == PDF_TOK_R)
                    {
                        val = pdf_new_indirect(ctx, a, b, xref);
                        break;
                    }
                }
                fz_throw(ctx, "invalid indirect reference in dict");

            default:
                fz_throw(ctx, "unknown token in dict");
            }

            fz_dict_put(dict, key, val);
            pdf_drop_obj(val);
            val = NULL;
            pdf_drop_obj(key);
            key = NULL;
        }
    }
    fz_catch(ctx)
    {
        pdf_drop_obj(dict);
        pdf_drop_obj(key);
        pdf_drop_obj(val);
        fz_throw(ctx, "cannot parse dict");
    }
    return dict;
}

コード例 #7

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: kzmkv/SumatraPDF

pdf_obj *
pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
    pdf_obj *ary = NULL;
    pdf_obj *obj = NULL;
    int a = 0, b = 0, n = 0;
    int tok;
    fz_context *ctx = file->ctx;
    pdf_obj *op;

    fz_var(obj);

    ary = pdf_new_array(ctx, 4);

    fz_try(ctx)
    {
        while (1)
        {
            tok = pdf_lex(file, buf);

            if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
            {
                if (n > 0)
                {
                    obj = pdf_new_int(ctx, a);
                    pdf_array_push(ary, obj);
                    pdf_drop_obj(obj);
                    obj = NULL;
                }
                if (n > 1)
                {
                    obj = pdf_new_int(ctx, b);
                    pdf_array_push(ary, obj);
                    pdf_drop_obj(obj);
                    obj = NULL;
                }
                n = 0;
            }

            if (tok == PDF_TOK_INT && n == 2)
            {
                obj = pdf_new_int(ctx, a);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                a = b;
                n --;
            }

            switch (tok)
            {
            case PDF_TOK_CLOSE_ARRAY:
                op = ary;
                goto end;

            case PDF_TOK_INT:
                if (n == 0)
                    a = buf->i;
                if (n == 1)
                    b = buf->i;
                n ++;
                break;

            case PDF_TOK_R:
                if (n != 2)
                    fz_throw(ctx, "cannot parse indirect reference in array");
                obj = pdf_new_indirect(ctx, a, b, xref);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                n = 0;
                break;

            case PDF_TOK_OPEN_ARRAY:
                obj = pdf_parse_array(xref, file, buf);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;

            case PDF_TOK_OPEN_DICT:
                obj = pdf_parse_dict(xref, file, buf);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;

            case PDF_TOK_NAME:
                obj = fz_new_name(ctx, buf->scratch);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;
            case PDF_TOK_REAL:
                obj = pdf_new_real(ctx, buf->f);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;
            case PDF_TOK_STRING:
                obj = pdf_new_string(ctx, buf->scratch, buf->len);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;
            case PDF_TOK_TRUE:
                obj = pdf_new_bool(ctx, 1);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;
            case PDF_TOK_FALSE:
                obj = pdf_new_bool(ctx, 0);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;
            case PDF_TOK_NULL:
                obj = pdf_new_null(ctx);
                pdf_array_push(ary, obj);
                pdf_drop_obj(obj);
                obj = NULL;
                break;

            default:
                fz_throw(ctx, "cannot parse token in array");
            }
        }
end:
        {}
    }
    fz_catch(ctx)
    {
        pdf_drop_obj(obj);
        pdf_drop_obj(ary);
        fz_throw(ctx, "cannot parse array");
    }
    return op;
}

コード例 #8

0

ファイルを表示

ファイル: pdfng.c プロジェクト: hanah0613/clamav-devel

struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
{
    struct pdf_array *res=NULL;
    struct pdf_array_node *node=NULL;
    const char *objstart;
    char *end;
    int in_string=0, ninner=0;

    /* Sanity checking */
    if (!(pdf) || !(obj) || !(begin))
        return NULL;

    objstart = obj->start + pdf->map;

    if (begin < objstart || (size_t)(begin - objstart) >= objsz)
        return NULL;

    if (begin[0] != '[')
        return NULL;

    /* Find the end of the array */
    end = begin;
    while ((size_t)(end - objstart) < objsz) {
        if (in_string) {
            if (*end == '\\') {
                end += 2;
                continue;
            }

            if (*end == ')')
                in_string = 0;

            end++;
            continue;
        }

        switch (*end) {
            case '(':
                in_string=1;
                break;
            case '[':
                ninner++;
                break;
            case ']':
                ninner--;
                break;
        }

        if (*end == ']' && ninner == 0)
            break;

        end++;
    }

    /* More sanity checking */
    if ((size_t)(end - objstart) == objsz)
        return NULL;

    if (*end != ']')
        return NULL;

    res = cli_calloc(1, sizeof(struct pdf_array));
    if (!(res))
        return NULL;

    begin++;
    while (begin < end) {
        char *val=NULL, *p1;
        struct pdf_array *arr=NULL;
        struct pdf_dict *dict=NULL;

        while (begin < end && isspace(begin[0]))
            begin++;

        if (begin == end)
            break;

        switch (begin[0]) {
            case '<':
                if ((size_t)(begin - objstart) < objsz - 2 && begin[1] == '<') {
                    dict = pdf_parse_dict(pdf, obj, objsz, begin, &begin);
                    begin+=2;
                    break;
                }

                /* Not a dictionary. Intentially fall through. */
            case '(':
                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin, NULL);
                begin += 2;
                break;
            case '[':
                /* XXX We should have a recursion counter here */
                arr = pdf_parse_array(pdf, obj, objsz, begin, &begin);
                begin+=1;
                break;
            default:
                p1 = end;
                if (!is_object_reference(begin, &p1, NULL)) {
                    p1 = begin+1;
                    while (p1 < end && !isspace(p1[0]))
                        p1++;
                }

                val = cli_calloc((p1 - begin) + 2, 1);
                if (!(val))
                    break;

                strncpy(val, begin, p1 - begin);
                val[p1 - begin] = '\0';

                begin = p1;
                break;
        }

        /* Parse error, just return what we could */
        if (!(val) && !(arr) && !(dict))
            break;

        if (!(node)) {
            res->nodes = res->tail = node = calloc(1, sizeof(struct pdf_array_node));
            if (!(node)) {
                if (dict)
                    pdf_free_dict(dict);
                if (val)
                    free(val);
                if (arr)
                    pdf_free_array(arr);

                break;
            }
        } else {
            node = calloc(1, sizeof(struct pdf_array_node));
            if (!(node)) {
                if (dict)
                    pdf_free_dict(dict);
                if (val)
                    free(val);
                if (arr)
                    pdf_free_array(arr);

                break;
            }

            node->prev = res->tail;
            if (res->tail)
                res->tail->next = node;
            res->tail = node;
        }

        if (val != NULL) {
            node->type = PDF_ARR_STRING;
            node->data = val;
            node->datasz = strlen(val);
        } else if (dict != NULL) {
            node->type = PDF_ARR_DICT;
            node->data = dict;
            node->datasz = sizeof(struct pdf_dict);
        } else {
            node->type = PDF_ARR_ARRAY;
            node->data = arr;
            node->datasz = sizeof(struct pdf_array);
        }
    }

    if (endchar)
        *endchar = end;

    return res;
}

コード例 #9

0

ファイルを表示

ファイル: pdfng.c プロジェクト: hanah0613/clamav-devel

struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
{
    struct pdf_dict *res=NULL;
    struct pdf_dict_node *node=NULL;
    const char *objstart;
    char *end;
    unsigned int in_string=0, ninner=0;

    /* Sanity checking */
    if (!(pdf) || !(obj) || !(begin))
        return NULL;

    objstart = (const char *)(obj->start + pdf->map);

    if (begin < objstart || (size_t)(begin - objstart) >= objsz - 2)
        return NULL;

    if (begin[0] != '<' || begin[1] != '<')
        return NULL;

    /* Find the end of the dictionary */
    end = begin;
    while ((size_t)(end - objstart) < objsz) {
        int increment=1;
        if (in_string) {
            if (*end == '\\') {
                end += 2;
                continue;
            }

            if (*end == ')')
                in_string = 0;

            end++;
            continue;
        }

        switch (*end) {
            case '(':
                in_string=1;
                break;
            case '<':
                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '<')
                    ninner++;
                increment=2;
                break;
            case '>':
                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '>')
                    ninner--;
                increment=2;
                break;
        }

        if ((size_t)(end - objstart) <= objsz - 2)
            if (end[0] == '>' && end[1] == '>' && ninner == 0)
                break;

        end += increment;
    }

    /* More sanity checking */
    if ((size_t)(end - objstart) >= objsz - 2)
        return NULL;

    if (end[0] != '>' || end[1] != '>')
        return NULL;

    res = cli_calloc(1, sizeof(struct pdf_dict));
    if (!(res))
        return NULL;

    /* Loop through each element of the dictionary */
    begin += 2;
    while (begin < end) {
        char *val=NULL, *key=NULL, *p1, *p2;
        struct pdf_dict *dict=NULL;
        struct pdf_array *arr=NULL;
        unsigned int nhex=0, i;

        /* Skip any whitespaces */
        while (begin < end && isspace(begin[0]))
            begin++;

        if (begin == end)
            break;

        /* Get the key */
        p1 = begin+1;
        while (p1 < end && !isspace(p1[0])) {
            int breakout=0;

            switch (*p1) {
                case '<':
                case '[':
                case '(':
                case '/':
                case '\r':
                case '\n':
                case ' ':
                case '\t':
                    breakout=1;
                    break;
                case '#':
                    /* Key name obfuscated with hex characters */
                    nhex++;
                    if (p1 > end-3) {
                        return res;
                    }

                    break;
            }
            
            if (breakout)
                break;

            p1++;
        }

        if (p1 == end)
            break;

        key = cli_calloc((p1 - begin) + 2, 1);
        if (!(key))
            break;

        if (nhex == 0) {
            /* Key isn't obfuscated with hex. Just copy the string */
            strncpy(key, begin, p1 - begin);
            key[p1 - begin] = '\0';
        } else {
            for (i=0, p2 = begin; p2 < p1; p2++, i++) {
                if (*p2 == '#') {
                    cli_hex2str_to(p2+1, key+i, 2);
                    p2 += 2;
                } else {
                    key[i] = *p2;
                }
            }
        }

        /* Now for the value */
        begin = p1;

        /* Skip any whitespaces */
        while (begin < end && isspace(begin[0]))
            begin++;

        if (begin == end) {
            free(key);
            break;
        }

        switch (begin[0]) {
            case '(':
                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
                begin = p1+2;
                break;
            case '[':
                arr = pdf_parse_array(pdf, obj, objsz, begin, &p1);
                begin = p1+1;
                break;
            case '<':
                if ((size_t)(begin - objstart) < objsz - 2) {
                    if (begin[1] == '<') {
                        dict = pdf_parse_dict(pdf, obj, objsz, begin, &p1);
                        begin = p1+2;
                        break;
                    }
                }

                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
                begin = p1+2;
                break;
            default:
                p1 = (begin[0] == '/') ? begin+1 : begin;
                while (p1 < end) {
                    int shouldbreak = 0;
                    switch (p1[0]) {
                        case '>':
                        case '/':
                            shouldbreak=1;
                            break;
                    }

                    if (shouldbreak)
                        break;

                    p1++;
                }

                is_object_reference(begin, &p1, NULL);

                val = cli_calloc((p1 - begin) + 2, 1);
                if (!(val))
                    break;

                strncpy(val, begin, p1 - begin);
                val[p1 - begin] = '\0';

                if (p1[0] != '/')
                    begin = p1+1;
                else
                    begin = p1;

                break;
        }

        if (!(val) && !(dict) && !(arr)) {
            free(key);
            break;
        }

        if (!(res->nodes)) {
            res->nodes = res->tail = node = cli_calloc(1, sizeof(struct pdf_dict_node));
            if (!(node)) {
                free(key);
                if (dict)
                    pdf_free_dict(dict);
                if (val)
                    free(val);
                if (arr)
                    pdf_free_array(arr);
                break;
            }
        } else {
            node = calloc(1, sizeof(struct pdf_dict_node));
            if (!(node)) {
                free(key);
                if (dict)
                    pdf_free_dict(dict);
                if (val)
                    free(val);
                if (arr)
                    pdf_free_array(arr);
                break;
            }

            node->prev = res->tail;
            if (res->tail)
                res->tail->next = node;
            res->tail = node;
        }

        node->key = key;
        if ((val)) {
            node->value = val;
            node->valuesz = strlen(val);
            node->type = PDF_DICT_STRING;
        } else if ((arr)) {
            node->value = arr;
            node->valuesz = sizeof(struct pdf_array);
            node->type = PDF_DICT_ARRAY;
        } else if ((dict)) {
            node->value = dict;
            node->valuesz = sizeof(struct pdf_dict);
            node->type = PDF_DICT_DICT;
        }
    }

    if (endchar)
        *endchar = end;

    return res;
}

コード例 #10

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: jason5001001/dataextractor

fz_error
pdf_parse_dict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *dict = NULL;
	fz_obj *key = NULL;
	fz_obj *val = NULL;
	int tok;
	int len;
	int a, b;

	dict = fz_new_dict(8);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

skip:
		if (tok == PDF_TOK_CLOSE_DICT)
		{
			*op = dict;
			return fz_okay;
		}

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID"))
		{
			*op = dict;
			return fz_okay;
		}

		if (tok != PDF_TOK_NAME)
		{
			fz_drop_obj(dict);
			return fz_throw("invalid key in dict");
		}

		key = fz_new_name(buf);

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(key);
			fz_drop_obj(dict);
			return fz_rethrow(error, "cannot parse dict");
		}

		switch (tok)
		{
		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&val, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(key);
				fz_drop_obj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&val, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(key);
				fz_drop_obj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			break;

		case PDF_TOK_NAME: val = fz_new_name(buf); break;
		case PDF_TOK_REAL: val = fz_new_real(fz_atof(buf)); break;
		case PDF_TOK_STRING: val = fz_new_string(buf, len); break;
		case PDF_TOK_TRUE: val = fz_new_bool(1); break;
		case PDF_TOK_FALSE: val = fz_new_bool(0); break;
		case PDF_TOK_NULL: val = fz_new_null(); break;

		case PDF_TOK_INT:
			/* 64-bit to allow for numbers > INT_MAX and overflow */
			a = (int) strtoll(buf, 0, 10);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
			{
				fz_drop_obj(key);
				fz_drop_obj(dict);
				return fz_rethrow(error, "cannot parse dict");
			}
			if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
				(tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
			{
				val = fz_new_int(a);
				fz_dict_put(dict, key, val);
				fz_drop_obj(val);
				fz_drop_obj(key);
				goto skip;
			}
			if (tok == PDF_TOK_INT)
			{
				b = atoi(buf);
				error = pdf_lex(&tok, file, buf, cap, &len);
				if (error)
				{
					fz_drop_obj(key);
					fz_drop_obj(dict);
					return fz_rethrow(error, "cannot parse dict");
				}
				if (tok == PDF_TOK_R)
				{
					val = fz_new_indirect(a, b, xref);
					break;
				}
			}
			fz_drop_obj(key);
			fz_drop_obj(dict);
			return fz_throw("invalid indirect reference in dict");

		default:
			fz_drop_obj(key);
			fz_drop_obj(dict);
			return fz_throw("unknown token in dict");
		}

		fz_dict_put(dict, key, val);
		fz_drop_obj(val);
		fz_drop_obj(key);
	}
}

コード例 #11

0

ファイルを表示

ファイル: pdf-parse.c プロジェクト: Enzime/mupdf

pdf_obj *
pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
	fz_stream *file, pdf_lexbuf *buf,
	int *onum, int *ogen, fz_off_t *ostmofs, int *try_repair)
{
	pdf_obj *obj = NULL;
	int num = 0, gen = 0;
	fz_off_t stm_ofs;
	pdf_token tok;
	fz_off_t a, b;

	fz_var(obj);

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_INT)
	{
		if (try_repair)
			*try_repair = 1;
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
	}
	num = buf->i;

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_INT)
	{
		if (try_repair)
			*try_repair = 1;
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
	}
	gen = buf->i;

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_OBJ)
	{
		if (try_repair)
			*try_repair = 1;
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
	}

	tok = pdf_lex(ctx, file, buf);

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		obj = pdf_parse_array(ctx, doc, file, buf);
		break;

	case PDF_TOK_OPEN_DICT:
		obj = pdf_parse_dict(ctx, doc, file, buf);
		break;

	case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break;
	case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break;
	case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
	case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break;
	case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break;
	case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break;

	case PDF_TOK_INT:
		a = buf->i;
		tok = pdf_lex(ctx, file, buf);

		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
		{
			obj = pdf_new_int_offset(ctx, doc, a);
			goto skip;
		}
		if (tok == PDF_TOK_INT)
		{
			b = buf->i;
			tok = pdf_lex(ctx, file, buf);
			if (tok == PDF_TOK_R)
			{
				obj = pdf_new_indirect(ctx, doc, a, b);
				break;
			}
		}
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen);

	case PDF_TOK_ENDOBJ:
		obj = pdf_new_null(ctx, doc);
		goto skip;

	default:
		fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen);
	}

	fz_try(ctx)
	{
		tok = pdf_lex(ctx, file, buf);
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, obj);
		fz_rethrow(ctx);
	}

skip:
	if (tok == PDF_TOK_STREAM)
	{
		int c = fz_read_byte(ctx, file);
		while (c == ' ')
			c = fz_read_byte(ctx, file);
		if (c == '\r')
		{
			c = fz_peek_byte(ctx, file);
			if (c != '\n')
				fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
			else
				fz_read_byte(ctx, file);
		}
		stm_ofs = fz_tell(ctx, file);
	}
	else if (tok == PDF_TOK_ENDOBJ)
	{
		stm_ofs = 0;
	}
	else
	{
		fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
		stm_ofs = 0;
	}

	if (onum) *onum = num;
	if (ogen) *ogen = gen;
	if (ostmofs) *ostmofs = stm_ofs;
	return obj;
}

コード例 #12

0

ファイルを表示

ファイル: pdf-repair.c プロジェクト: iezbli/zbli

int
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root)
{
	fz_stream *file = doc->file;
	pdf_token tok;
	int stm_len;

	*stmofsp = 0;
	if (stmlenp)
		*stmlenp = -1;

	stm_len = 0;

	/* On entry to this function, we know that we've just seen
	 * '<int> <int> obj'. We expect the next thing we see to be a
	 * pdf object. Regardless of the type of thing we meet next
	 * we only need to fully parse it if it is a dictionary. */
	tok = pdf_lex(ctx, file, buf);

	if (tok == PDF_TOK_OPEN_DICT)
	{
		pdf_obj *dict, *obj;

		fz_try(ctx)
		{
			dict = pdf_parse_dict(ctx, doc, file, buf);
		}
		fz_catch(ctx)
		{
			fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
			/* Don't let a broken object at EOF overwrite a good one */
			if (file->eof)
				fz_rethrow(ctx);
			/* Silently swallow the error */
			dict = pdf_new_dict(ctx, NULL, 2);
		}

		/* We must be careful not to try to resolve any indirections
		 * here. We have just read dict, so we know it to be a non
		 * indirected dictionary. Before we look at any values that
		 * we get back from looking up in it, we need to check they
		 * aren't indirected. */

		if (encrypt || id || root)
		{
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef))
			{
				if (encrypt)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
					if (obj)
					{
						pdf_drop_obj(ctx, *encrypt);
						*encrypt = pdf_keep_obj(ctx, obj);
					}
				}

				if (id)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
					if (obj)
					{
						pdf_drop_obj(ctx, *id);
						*id = pdf_keep_obj(ctx, obj);
					}
				}

				if (root)
					*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root));
			}
		}

		obj = pdf_dict_get(ctx, dict, PDF_NAME_Length);
		if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
			stm_len = pdf_to_int(ctx, obj);

		if (doc->file_reading_linearly && page)
		{
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page))
			{
				pdf_drop_obj(ctx, *page);
				*page = pdf_keep_obj(ctx, dict);
			}
		}

		pdf_drop_obj(ctx, dict);
	}

コード例 #13

0

ファイルを表示

ファイル: pdf_xref.c プロジェクト: Shelnutt2/AJournal

static fz_error
pdf_read_old_xref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
{
	fz_error error;
	int ofs, len;
	char *s;
	int n;
	int tok;
	int i;
	int c;

	fz_read_line(xref->file, buf, cap);
	if (strncmp(buf, "xref", 4) != 0)
		return fz_throw("cannot find xref marker");

	while (1)
	{
		c = fz_peek_byte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		fz_read_line(xref->file, buf, cap);
		s = buf;
		ofs = atoi(fz_strsep(&s, " "));
		len = atoi(fz_strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
		{
			fz_warn("broken xref section. proceeding anyway.");
			fz_seek(xref->file, -(2 + (int)strlen(s)), 1);
		}

		/* broken pdfs where size in trailer undershoots entries in xref sections */
		if (ofs + len > xref->len)
		{
			fz_warn("broken xref section, proceeding anyway.");
			pdf_resize_xref(xref, ofs + len);
		}

		for (i = ofs; i < ofs + len; i++)
		{
			n = fz_read(xref->file, (unsigned char *) buf, 20);
			if (n < 0)
				return fz_rethrow(n, "cannot read xref table");
			if (!xref->table[i].type)
			{
				s = buf;

				/* broken pdfs where line start with white space */
				while (*s != '\0' && iswhite(*s))
					s++;

				xref->table[i].ofs = atoi(s);
				xref->table[i].gen = atoi(s + 11);
				xref->table[i].type = s[17];
				if (s[17] != 'f' && s[17] != 'n' && s[17] != 'o')
					return fz_throw("unexpected xref type: %#x (%d %d R)", s[17], i, xref->table[i].gen);
			}
		}
	}

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_TRAILER)
		return fz_throw("expected trailer marker");

	error = pdf_lex(&tok, xref->file, buf, cap, &n);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	if (tok != PDF_TOK_OPEN_DICT)
		return fz_throw("expected trailer dictionary");

	error = pdf_parse_dict(trailerp, xref, xref->file, buf, cap);
	if (error)
		return fz_rethrow(error, "cannot parse trailer");
	return fz_okay;
}

コード例 #14

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: jason5001001/dataextractor

fz_error
pdf_parse_ind_obj(fz_obj **op, pdf_xref *xref,
	fz_stream *file, char *buf, int cap,
	int *onum, int *ogen, int *ostmofs)
{
	fz_error error = fz_okay;
	fz_obj *obj = NULL;
	int num = 0, gen = 0, stm_ofs;
	int tok;
	int len;
	int a, b;

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_INT)
		return fz_throw("expected object number (%d %d R)", num, gen);
	num = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_INT)
		return fz_throw("expected generation number (%d %d R)", num, gen);
	gen = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_OBJ)
		return fz_throw("expected 'obj' keyword (%d %d R)", num, gen);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		error = pdf_parse_array(&obj, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TOK_OPEN_DICT:
		error = pdf_parse_dict(&obj, xref, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TOK_NAME: obj = fz_new_name(buf); break;
	case PDF_TOK_REAL: obj = fz_new_real(fz_atof(buf)); break;
	case PDF_TOK_STRING: obj = fz_new_string(buf, len); break;
	case PDF_TOK_TRUE: obj = fz_new_bool(1); break;
	case PDF_TOK_FALSE: obj = fz_new_bool(0); break;
	case PDF_TOK_NULL: obj = fz_new_null(); break;

	case PDF_TOK_INT:
		a = atoi(buf);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
		{
			obj = fz_new_int(a);
			goto skip;
		}
		if (tok == PDF_TOK_INT)
		{
			b = atoi(buf);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
				return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
			if (tok == PDF_TOK_R)
			{
				obj = fz_new_indirect(a, b, xref);
				break;
			}
		}
		return fz_throw("expected 'R' keyword (%d %d R)", num, gen);

	case PDF_TOK_ENDOBJ:
		obj = fz_new_null();
		goto skip;

	default:
		return fz_throw("syntax error in object (%d %d R)", num, gen);
	}

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
	{
		fz_drop_obj(obj);
		return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
	}

skip:
	if (tok == PDF_TOK_STREAM)
	{
		int c = fz_read_byte(file);
		while (c == ' ')
			c = fz_read_byte(file);
		if (c == '\r')
		{
			c = fz_peek_byte(file);
			if (c != '\n')
				fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
			else
				fz_read_byte(file);
		}
		stm_ofs = fz_tell(file);
	}
	else if (tok == PDF_TOK_ENDOBJ)
	{
		stm_ofs = 0;
	}
	else
	{
		fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
		stm_ofs = 0;
	}

	if (onum) *onum = num;
	if (ogen) *ogen = gen;
	if (ostmofs) *ostmofs = stm_ofs;
	*op = obj;
	return fz_okay;
}

コード例 #15

0

ファイルを表示

ファイル: pdf-parse.c プロジェクト: FabriceSalvaire/mupdf-v1.3

pdf_obj *
pdf_parse_dict(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
{
	pdf_obj *dict;
	pdf_obj *key = NULL;
	pdf_obj *val = NULL;
	pdf_token tok;
	int a, b;
	fz_context *ctx = file->ctx;

	dict = pdf_new_dict(doc, 8);

	fz_var(key);
	fz_var(val);

	fz_try(ctx)
	{
		while (1)
		{
			tok = pdf_lex(file, buf);
	skip:
			if (tok == PDF_TOK_CLOSE_DICT)
				break;

			/* for BI .. ID .. EI in content streams */
			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
				break;

			if (tok != PDF_TOK_NAME)
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");

			key = pdf_new_name(doc, buf->scratch);

			tok = pdf_lex(file, buf);

			switch (tok)
			{
			case PDF_TOK_OPEN_ARRAY:
				val = pdf_parse_array(doc, file, buf);
				break;

			case PDF_TOK_OPEN_DICT:
				val = pdf_parse_dict(doc, file, buf);
				break;

			case PDF_TOK_NAME: val = pdf_new_name(doc, buf->scratch); break;
			case PDF_TOK_REAL: val = pdf_new_real(doc, buf->f); break;
			case PDF_TOK_STRING: val = pdf_new_string(doc, buf->scratch, buf->len); break;
			case PDF_TOK_TRUE: val = pdf_new_bool(doc, 1); break;
			case PDF_TOK_FALSE: val = pdf_new_bool(doc, 0); break;
			case PDF_TOK_NULL: val = pdf_new_null(doc); break;

			case PDF_TOK_INT:
				/* 64-bit to allow for numbers > INT_MAX and overflow */
				a = buf->i;
				tok = pdf_lex(file, buf);
				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
				{
					val = pdf_new_int(doc, a);
					pdf_dict_put(dict, key, val);
					pdf_drop_obj(val);
					val = NULL;
					pdf_drop_obj(key);
					key = NULL;
					goto skip;
				}
				if (tok == PDF_TOK_INT)
				{
					b = buf->i;
					tok = pdf_lex(file, buf);
					if (tok == PDF_TOK_R)
					{
						val = pdf_new_indirect(doc, a, b);
						break;
					}
				}
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");

			default:
				fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
			}

			pdf_dict_put(dict, key, val);
			pdf_drop_obj(val);
			val = NULL;
			pdf_drop_obj(key);
			key = NULL;
		}
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(dict);
		pdf_drop_obj(key);
		pdf_drop_obj(val);
		fz_rethrow_message(ctx, "cannot parse dict");
	}
	return dict;
}

コード例 #16

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: kzmkv/SumatraPDF

pdf_obj *
pdf_parse_ind_obj(pdf_document *xref,
                  fz_stream *file, pdf_lexbuf *buf,
                  int *onum, int *ogen, int *ostmofs)
{
    pdf_obj *obj = NULL;
    int num = 0, gen = 0, stm_ofs;
    int tok;
    int a, b;
    fz_context *ctx = file->ctx;

    fz_var(obj);

    tok = pdf_lex(file, buf);
    /* RJW: cannot parse indirect object (%d %d R)", num, gen */
    if (tok != PDF_TOK_INT)
        fz_throw(ctx, "expected object number (%d %d R)", num, gen);
    num = buf->i;

    tok = pdf_lex(file, buf);
    /* RJW: "cannot parse indirect object (%d %d R)", num, gen */
    if (tok != PDF_TOK_INT)
        fz_throw(ctx, "expected generation number (%d %d R)", num, gen);
    gen = buf->i;

    tok = pdf_lex(file, buf);
    /* RJW: "cannot parse indirect object (%d %d R)", num, gen */
    if (tok != PDF_TOK_OBJ)
        fz_throw(ctx, "expected 'obj' keyword (%d %d R)", num, gen);

    tok = pdf_lex(file, buf);
    /* RJW: "cannot parse indirect object (%d %d R)", num, gen */

    switch (tok)
    {
    case PDF_TOK_OPEN_ARRAY:
        obj = pdf_parse_array(xref, file, buf);
        /* RJW: "cannot parse indirect object (%d %d R)", num, gen */
        break;

    case PDF_TOK_OPEN_DICT:
        obj = pdf_parse_dict(xref, file, buf);
        /* RJW: "cannot parse indirect object (%d %d R)", num, gen */
        break;

    case PDF_TOK_NAME:
        obj = fz_new_name(ctx, buf->scratch);
        break;
    case PDF_TOK_REAL:
        obj = pdf_new_real(ctx, buf->f);
        break;
    case PDF_TOK_STRING:
        obj = pdf_new_string(ctx, buf->scratch, buf->len);
        break;
    case PDF_TOK_TRUE:
        obj = pdf_new_bool(ctx, 1);
        break;
    case PDF_TOK_FALSE:
        obj = pdf_new_bool(ctx, 0);
        break;
    case PDF_TOK_NULL:
        obj = pdf_new_null(ctx);
        break;

    case PDF_TOK_INT:
        a = buf->i;
        tok = pdf_lex(file, buf);
        /* "cannot parse indirect object (%d %d R)", num, gen */
        if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
        {
            obj = pdf_new_int(ctx, a);
            goto skip;
        }
        if (tok == PDF_TOK_INT)
        {
            b = buf->i;
            tok = pdf_lex(file, buf);
            /* RJW: "cannot parse indirect object (%d %d R)", num, gen); */
            if (tok == PDF_TOK_R)
            {
                obj = pdf_new_indirect(ctx, a, b, xref);
                break;
            }
        }
        fz_throw(ctx, "expected 'R' keyword (%d %d R)", num, gen);

    case PDF_TOK_ENDOBJ:
        obj = pdf_new_null(ctx);
        goto skip;

    default:
        fz_throw(ctx, "syntax error in object (%d %d R)", num, gen);
    }

    fz_try(ctx)
    {
        tok = pdf_lex(file, buf);
    }
    fz_catch(ctx)
    {
        pdf_drop_obj(obj);
        fz_throw(ctx, "cannot parse indirect object (%d %d R)", num, gen);
    }

skip:
    if (tok == PDF_TOK_STREAM)
    {
        int c = fz_read_byte(file);
        while (c == ' ')
            c = fz_read_byte(file);
        if (c == '\r')
        {
            c = fz_peek_byte(file);
            if (c != '\n')
                fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
            else
                fz_read_byte(file);
        }
        stm_ofs = fz_tell(file);
    }
    else if (tok == PDF_TOK_ENDOBJ)
    {
        stm_ofs = 0;
    }
    else
    {
        fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
        stm_ofs = 0;
    }

    if (onum) *onum = num;
    if (ogen) *ogen = gen;
    if (ostmofs) *ostmofs = stm_ofs;
    return obj;
}

コード例 #17

0

ファイルを表示

ファイル: pdf_parse.c プロジェクト: jason5001001/dataextractor

fz_error
pdf_parse_array(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *ary = NULL;
	fz_obj *obj = NULL;
	int a = 0, b = 0, n = 0;
	int tok;
	int len;

	ary = fz_new_array(4);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(ary);
			return fz_rethrow(error, "cannot parse array");
		}

		if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
		{
			if (n > 0)
			{
				obj = fz_new_int(a);
				fz_array_push(ary, obj);
				fz_drop_obj(obj);
			}
			if (n > 1)
			{
				obj = fz_new_int(b);
				fz_array_push(ary, obj);
				fz_drop_obj(obj);
			}
			n = 0;
		}

		if (tok == PDF_TOK_INT && n == 2)
		{
			obj = fz_new_int(a);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			a = b;
			n --;
		}

		switch (tok)
		{
		case PDF_TOK_CLOSE_ARRAY:
			*op = ary;
			return fz_okay;

		case PDF_TOK_INT:
			if (n == 0)
				a = atoi(buf);
			if (n == 1)
				b = atoi(buf);
			n ++;
			break;

		case PDF_TOK_R:
			if (n != 2)
			{
				fz_drop_obj(ary);
				return fz_throw("cannot parse indirect reference in array");
			}
			obj = fz_new_indirect(a, b, xref);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			n = 0;
			break;

		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&obj, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ary);
				return fz_rethrow(error, "cannot parse array");
			}
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&obj, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ary);
				return fz_rethrow(error, "cannot parse array");
			}
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;

		case PDF_TOK_NAME:
			obj = fz_new_name(buf);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;
		case PDF_TOK_REAL:
			obj = fz_new_real(fz_atof(buf));
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;
		case PDF_TOK_STRING:
			obj = fz_new_string(buf, len);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;
		case PDF_TOK_TRUE:
			obj = fz_new_bool(1);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;
		case PDF_TOK_FALSE:
			obj = fz_new_bool(0);
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;
		case PDF_TOK_NULL:
			obj = fz_new_null();
			fz_array_push(ary, obj);
			fz_drop_obj(obj);
			break;

		default:
			fz_drop_obj(ary);
			return fz_throw("cannot parse token in array");
		}
	}
}