Code example #1
/* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
    /* Call sites don't need the result length so it's not accumulated. */
    while (count > 0) {
        (void) duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
        count--;
    }
}
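For context, the u16 range tables pair up entries as [start,end] ranges. Code example #5 below calls this helper when emitting the built-in \d class; the range count (pairs) is emitted first, and the total u16 entry count is passed to the helper. The call site from that example:

    duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)));   /* number of ranges (pairs) */
    duk__append_u16_list(re_ctx, duk_unicode_re_ranges_digit, sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));   /* total u16 count */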
Code example #2
DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
#if defined(DUK_USE_PREFER_SIZE)
	duk__append_u32(re_ctx, x);
#else
	DUK_ASSERT(x <= 0x7fU);
	DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);
#endif
}
Code example #3
DUK_LOCAL void duk__generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
    duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;

    DUK_DD(DUK_DDPRINT("duk__generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
                       (void *) re_ctx, (long) r1, (long) r2, (long) direct));

    if (!direct && (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE)) {
        /*
         *  Canonicalize a range, generating result ranges as necessary.
         *  Needs to exhaustively scan the entire range (at most 65536
         *  code points).  If 'direct' is set, caller (lexer) has ensured
         *  that the range is already canonicalization compatible (this
         *  is used to avoid unnecessary canonicalization of built-in
         *  ranges like \W, which are not affected by canonicalization).
         *
         *  NOTE: here is one place where we don't want to support chars
         *  outside the BMP, because the exhaustive search would be
         *  massively larger.
         */

        duk_codepoint_t i;
        duk_codepoint_t t;
        duk_codepoint_t r_start, r_end;

        r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
        r_end = r_start;
        for (i = r1 + 1; i <= r2; i++) {
            t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);
            if (t == r_end + 1) {
                r_end = t;
            } else {
                DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
                duk__append_u32(re_ctx, (duk_uint32_t) r_start);
                duk__append_u32(re_ctx, (duk_uint32_t) r_end);
                re_ctx->nranges++;
                r_start = t;
                r_end = t;
            }
        }
        DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
        duk__append_u32(re_ctx, (duk_uint32_t) r_start);
        duk__append_u32(re_ctx, (duk_uint32_t) r_end);
        re_ctx->nranges++;
    } else {
        DUK_DD(DUK_DDPRINT("direct, emit range: [%ld,%ld]", (long) r1, (long) r2));
        duk__append_u32(re_ctx, (duk_uint32_t) r1);
        duk__append_u32(re_ctx, (duk_uint32_t) r2);
        re_ctx->nranges++;
    }
}
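The merging logic above can be seen in isolation with a minimal standalone sketch. Here toupper() stands in for duk_unicode_re_canonicalize_char() purely for illustration (an assumption; the real canonicalizer is Unicode-aware and needs the thread context):

#include <ctype.h>
#include <stdio.h>

static void emit_range(int start, int end) {
    printf("range [%d,%d]\n", start, end);
}

/* Exhaustively canonicalize [r1,r2] and merge consecutive canonical
 * code points into output ranges, mirroring the loop above.
 */
static void generate_ranges_ci(int r1, int r2) {
    int i, t;
    int r_start = toupper(r1);
    int r_end = r_start;

    for (i = r1 + 1; i <= r2; i++) {
        t = toupper(i);
        if (t == r_end + 1) {
            r_end = t;                   /* canonical result extends the current run */
        } else {
            emit_range(r_start, r_end);  /* run broken: emit it, start a new one */
            r_start = t;
            r_end = t;
        }
    }
    emit_range(r_start, r_end);          /* emit the final pending run */
}

int main(void) {
    generate_ranges_ci('a', 'f');  /* prints: range [65,70], i.e. A-F */
    return 0;
}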
Code example #4
DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
    duk_context *ctx = (duk_context *) thr;
    duk_re_compiler_ctx re_ctx;
    duk_lexer_point lex_point;
    duk_hstring *h_pattern;
    duk_hstring *h_flags;
    duk__re_disjunction_info ign_disj;

    DUK_ASSERT(thr != NULL);
    DUK_ASSERT(ctx != NULL);

    /*
     *  Args validation
     */

    /* TypeError if fails */
    h_pattern = duk_require_hstring(ctx, -2);
    h_flags = duk_require_hstring(ctx, -1);

    /*
     *  Create normalized 'source' property (E5 Section 15.10.3).
     */

    /* [ ... pattern flags ] */

    duk__create_escaped_source(thr, -2);

    /* [ ... pattern flags escaped_source ] */

    /*
     *  Init compilation context
     */

    /* [ ... pattern flags escaped_source buffer ] */

    DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
    DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, except for (possible) NULL inits */
    re_ctx.thr = thr;
    re_ctx.lex.thr = thr;
    re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
    re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
    re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;
    re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;
    re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);

    DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);

    DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
                       (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));

    /*
     *  Init lexer
     */

    lex_point.offset = 0;  /* expensive init, just want to fill window */
    lex_point.line = 1;
    DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);

    /*
     *  Compilation
     */

    DUK_DD(DUK_DDPRINT("starting regexp compilation"));

    duk__append_u32(&re_ctx, DUK_REOP_SAVE);
    duk__append_u32(&re_ctx, 0);
    duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);
    duk__append_u32(&re_ctx, DUK_REOP_SAVE);
    duk__append_u32(&re_ctx, 1);
    duk__append_u32(&re_ctx, DUK_REOP_MATCH);

    /*
     *  Check for invalid backreferences; note that it is NOT an error
     *  to back-reference a capture group which has not yet been introduced
     *  in the pattern (as in /\1(foo)/); in fact, the backreference will
     *  always match!  It IS an error to back-reference a capture group
     *  which will never be introduced in the pattern.  Thus, we can check
     *  for such references only after parsing is complete.
     */

    if (re_ctx.highest_backref > re_ctx.captures) {
        DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
    }

    /*
     *  Emit compiled regexp header: flags, ncaptures
     *  (insertion order inverted on purpose)
     */

    duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
    duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);

    /* [ ... pattern flags escaped_source buffer ] */

    DUK_BW_COMPACT(thr, &re_ctx.bw);
    duk_to_string(ctx, -1);  /* coerce to string */

    /* [ ... pattern flags escaped_source bytecode ] */

    /*
     *  Finalize stack
     */

    duk_remove(ctx, -4);     /* -> [ ... flags escaped_source bytecode ] */
    duk_remove(ctx, -3);     /* -> [ ... escaped_source bytecode ] */

    DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
                       (duk_tval *) duk_get_tval(ctx, -1), (duk_tval *) duk_get_tval(ctx, -2)));
}
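The two duk__insert_u32() calls near the end insert at offset 0 in reverse order: the value inserted last ends up first, so the compiled buffer begins with re_flags, then the save-slot count, then the instructions. A tiny standalone sketch of that mechanic (a plain int array stands in for the bufwriter, purely for illustration):

#include <stdio.h>
#include <string.h>

static int buf[8];
static int buf_len = 0;

/* Shift the tail right by one slot and write 'value' at 'offset'. */
static void insert_at(int offset, int value) {
    memmove(&buf[offset + 1], &buf[offset], (size_t) (buf_len - offset) * sizeof(int));
    buf[offset] = value;
    buf_len++;
}

int main(void) {
    int i;

    insert_at(0, 300);  /* stand-in for the already emitted bytecode body */
    insert_at(0, 200);  /* nsaved = (captures + 1) * 2, inserted first */
    insert_at(0, 100);  /* re_flags, inserted second, ends up at the front */
    for (i = 0; i < buf_len; i++) {
        printf("%d ", buf[i]);  /* prints: 100 200 300 */
    }
    printf("\n");
    return 0;
}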
Code example #5
DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
    duk_int32_t atom_start_offset = -1;                   /* negative -> no atom matched on previous round */
    duk_int32_t atom_char_length = 0;                     /* negative -> complex atom */
    duk_uint32_t atom_start_captures = re_ctx->captures;  /* value of re_ctx->captures at start of atom */
    duk_int32_t unpatched_disjunction_split = -1;
    duk_int32_t unpatched_disjunction_jump = -1;
    duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
    duk_int32_t res_charlen = 0;  /* -1 if disjunction is complex, char length if simple */
    duk__re_disjunction_info tmp_disj;

    DUK_ASSERT(out_atom_info != NULL);

    if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
        DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
    }
    re_ctx->recursion_depth++;

#if 0
    out_atom_info->start_captures = re_ctx->captures;
#endif

    for (;;) {
        /* atom_char_length, atom_start_offset, atom_start_captures reflect the
         * atom matched on the previous loop.  If a quantifier is encountered
         * on this loop, these are needed to handle the quantifier correctly.
         * new_atom_char_length etc are for the atom parsed on this round;
         * they're written to atom_char_length etc at the end of the round.
         */
        duk_int32_t new_atom_char_length;   /* char length of the atom parsed in this loop */
        duk_int32_t new_atom_start_offset;  /* bytecode start offset of the atom parsed in this loop
                                             * (allows quantifiers to copy the atom bytecode)
                                             */
        duk_uint32_t new_atom_start_captures;  /* re_ctx->captures at the start of the atom parsed in this loop */

        duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);

        DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
                           (long) re_ctx->curr_token.t,
                           (long) re_ctx->curr_token.num,
                           (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
                           (int) re_ctx->curr_token.num : (int) '?'));

        /* set by atom case clauses */
        new_atom_start_offset = -1;
        new_atom_char_length = -1;
        new_atom_start_captures = re_ctx->captures;

        switch (re_ctx->curr_token.t) {
        case DUK_RETOK_DISJUNCTION: {
            /*
             *  The handling here is a bit tricky.  If a previous '|' has been processed,
             *  we have a pending split1 and a pending jump (for a previous match).  These
             *  need to be back-patched carefully.  See docs for a detailed example.
             */

            /* patch pending jump and split */
            if (unpatched_disjunction_jump >= 0) {
                duk_uint32_t offset;

                DUK_ASSERT(unpatched_disjunction_split >= 0);
                offset = unpatched_disjunction_jump;
                offset += duk__insert_jump_offset(re_ctx,
                                                  offset,
                                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
                /* offset is now target of the pending split (right after jump) */
                duk__insert_jump_offset(re_ctx,
                                        unpatched_disjunction_split,
                                        offset - unpatched_disjunction_split);
            }

            /* add a new pending split to the beginning of the entire disjunction */
            (void) duk__insert_u32(re_ctx,
                                   entry_offset,
                                   DUK_REOP_SPLIT1);   /* prefer direct execution */
            unpatched_disjunction_split = entry_offset + 1;   /* +1 for opcode */

            /* add a new pending match jump for latest finished alternative */
            duk__append_u32(re_ctx, DUK_REOP_JUMP);
            unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

            /* 'taint' result as complex */
            res_charlen = -1;
            break;
        }
        case DUK_RETOK_QUANTIFIER: {
            if (atom_start_offset < 0) {
                DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
            }
            if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
                DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
            }
            if (atom_char_length >= 0) {
                /*
                 *  Simple atom
                 *
                 *  If atom_char_length is zero, we'll have unbounded execution time for e.g.
                 *  /()*x/.exec('x').  We can't just skip the match because it might have some
                 *  side effects (for instance, if we allowed captures in simple atoms, the
                 *  capture needs to happen).  The simple solution below is to force the
                 *  quantifier to match at most once, since the additional matches have no effect.
                 *
                 *  With a simple atom there can be no capture groups, so no captures need
                 *  to be reset.
                 */
                duk_int32_t atom_code_length;
                duk_uint32_t offset;
                duk_uint32_t qmin, qmax;

                qmin = re_ctx->curr_token.qmin;
                qmax = re_ctx->curr_token.qmax;
                if (atom_char_length == 0) {
                    /* qmin and qmax will be 0 or 1 */
                    if (qmin > 1) {
                        qmin = 1;
                    }
                    if (qmax > 1) {
                        qmax = 1;
                    }
                }

                duk__append_u32(re_ctx, DUK_REOP_MATCH);   /* complete 'sub atom' */
                atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - atom_start_offset);

                offset = atom_start_offset;
                if (re_ctx->curr_token.greedy) {
                    offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
                    offset += duk__insert_u32(re_ctx, offset, qmin);
                    offset += duk__insert_u32(re_ctx, offset, qmax);
                    offset += duk__insert_u32(re_ctx, offset, atom_char_length);
                    offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
                } else {
                    offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
                    offset += duk__insert_u32(re_ctx, offset, qmin);
                    offset += duk__insert_u32(re_ctx, offset, qmax);
                    offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
                }
                DUK_UNREF(offset);  /* silence scan-build warning */
            } else {
                /*
                 *  Complex atom
                 *
                 *  The original code is used as a template, and removed at the end
                 *  (this differs from the handling of simple quantifiers).
                 *
                 *  NOTE: there is no current solution for empty atoms in complex
                 *  quantifiers.  This would need some sort of a 'progress' instruction.
                 *
                 *  XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
                 */
                duk_int32_t atom_code_length;
                duk_uint32_t atom_copies;
                duk_uint32_t tmp_qmin, tmp_qmax;

                /* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
                atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
                              re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
                if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
                    DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
                }

                /* wipe the capture range made by the atom (if any) */
                DUK_ASSERT(atom_start_captures <= re_ctx->captures);
                if (atom_start_captures != re_ctx->captures) {
                    DUK_ASSERT(atom_start_captures < re_ctx->captures);
                    DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
                                         (long) atom_start_captures, (long) re_ctx->captures));

                    /* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
                    duk__insert_u32(re_ctx, atom_start_offset, (re_ctx->captures - atom_start_captures) * 2);
                    duk__insert_u32(re_ctx, atom_start_offset, (atom_start_captures + 1) * 2);
                    duk__insert_u32(re_ctx, atom_start_offset, DUK_REOP_WIPERANGE);
                } else {
                    DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
                                         (long) atom_start_captures));
                }

                atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;

                /* insert the required matches (qmin) by copying the atom */
                tmp_qmin = re_ctx->curr_token.qmin;
                tmp_qmax = re_ctx->curr_token.qmax;
                while (tmp_qmin > 0) {
                    duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
                    tmp_qmin--;
                    if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
                        tmp_qmax--;
                    }
                }
                DUK_ASSERT(tmp_qmin == 0);

                /* insert code for matching the remainder - infinite or finite */
                if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
                    /* reuse last emitted atom for remaining 'infinite' quantifier */

                    if (re_ctx->curr_token.qmin == 0) {
                        /* Special case: original qmin was zero so there is nothing
                         * to repeat.  Emit an atom copy but jump over it here.
                         */
                        duk__append_u32(re_ctx, DUK_REOP_JUMP);
                        duk__append_jump_offset(re_ctx, atom_code_length);
                        duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
                    }
                    if (re_ctx->curr_token.greedy) {
                        duk__append_u32(re_ctx, DUK_REOP_SPLIT2);   /* prefer jump */
                    } else {
                        duk__append_u32(re_ctx, DUK_REOP_SPLIT1);   /* prefer direct */
                    }
                    duk__append_jump_offset(re_ctx, -atom_code_length - 1);  /* -1 for opcode */
                } else {
                    /*
                     *  The remaining matches are emitted as sequence of SPLITs and atom
                     *  copies; the SPLITs skip the remaining copies and match the sequel.
                     *  This sequence needs to be emitted starting from the last copy
                     *  because the SPLITs are variable length due to the variable length
                     *  skip offset.  This causes a lot of memory copying now.
                     *
                     *  Example structure (greedy, match maximum # atoms):
                     *
                     *      SPLIT1 LSEQ
                     *      (atom)
                     *      SPLIT1 LSEQ    ; <- the byte length of this instruction is needed
                     *      (atom)         ; to encode the above SPLIT1 correctly
                     *      ...
                     *   LSEQ:
                     */
                    duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
                    while (tmp_qmax > 0) {
                        duk__insert_slice(re_ctx, offset, atom_start_offset, atom_code_length);
                        if (re_ctx->curr_token.greedy) {
                            duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1);   /* prefer direct */
                        } else {
                            duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2);   /* prefer jump */
                        }
                        duk__insert_jump_offset(re_ctx,
                                                offset + 1,   /* +1 for opcode */
                                                (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
                        tmp_qmax--;
                    }
                }

                /* remove the original 'template' atom */
                duk__remove_slice(re_ctx, atom_start_offset, atom_code_length);
            }

            /* 'taint' result as complex */
            res_charlen = -1;
            break;
        }
        case DUK_RETOK_ASSERT_START: {
            duk__append_u32(re_ctx, DUK_REOP_ASSERT_START);
            break;
        }
        case DUK_RETOK_ASSERT_END: {
            duk__append_u32(re_ctx, DUK_REOP_ASSERT_END);
            break;
        }
        case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
            duk__append_u32(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
            break;
        }
        case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
            duk__append_u32(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
            break;
        }
        case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
        case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
            duk_uint32_t offset;
            duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
                                  DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;

            offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
            duk__parse_disjunction(re_ctx, 0, &tmp_disj);
            duk__append_u32(re_ctx, DUK_REOP_MATCH);

            (void) duk__insert_u32(re_ctx, offset, opcode);
            (void) duk__insert_jump_offset(re_ctx,
                                           offset + 1,   /* +1 for opcode */
                                           (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));

            /* 'taint' result as complex -- this is conservative,
             * as lookaheads do not backtrack.
             */
            res_charlen = -1;
            break;
        }
        case DUK_RETOK_ATOM_PERIOD: {
            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx, DUK_REOP_PERIOD);
            break;
        }
        case DUK_RETOK_ATOM_CHAR: {
            /* Note: successive characters could be joined into string matches
             * but this is not trivial (consider e.g. '/xyz+/'); see docs for
             * more discussion.
             */
            duk_uint32_t ch;

            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx, DUK_REOP_CHAR);
            ch = re_ctx->curr_token.num;
            if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
                ch = duk_unicode_re_canonicalize_char(re_ctx->thr, ch);
            }
            duk__append_u32(re_ctx, ch);
            break;
        }
        case DUK_RETOK_ATOM_DIGIT:
        case DUK_RETOK_ATOM_NOT_DIGIT: {
            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx,
                            (re_ctx->curr_token.t == DUK_RETOK_ATOM_DIGIT) ?
                            DUK_REOP_RANGES : DUK_REOP_INVRANGES);
            duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)));
            duk__append_u16_list(re_ctx, duk_unicode_re_ranges_digit, sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
            break;
        }
        case DUK_RETOK_ATOM_WHITE:
        case DUK_RETOK_ATOM_NOT_WHITE: {
            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx,
                            (re_ctx->curr_token.t == DUK_RETOK_ATOM_WHITE) ?
                            DUK_REOP_RANGES : DUK_REOP_INVRANGES);
            duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)));
            duk__append_u16_list(re_ctx, duk_unicode_re_ranges_white, sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
            break;
        }
        case DUK_RETOK_ATOM_WORD_CHAR:
        case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx,
                            (re_ctx->curr_token.t == DUK_RETOK_ATOM_WORD_CHAR) ?
                            DUK_REOP_RANGES : DUK_REOP_INVRANGES);
            duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t)));
            duk__append_u16_list(re_ctx, duk_unicode_re_ranges_wordchar, sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
            break;
        }
        case DUK_RETOK_ATOM_BACKREFERENCE: {
            duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
            if (backref > re_ctx->highest_backref) {
                re_ctx->highest_backref = backref;
            }
            new_atom_char_length = -1;   /* mark as complex */
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx, DUK_REOP_BACKREFERENCE);
            duk__append_u32(re_ctx, backref);
            break;
        }
        case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
            duk_uint32_t cap;

            new_atom_char_length = -1;   /* mark as complex (capture handling) */
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            cap = ++re_ctx->captures;
            duk__append_u32(re_ctx, DUK_REOP_SAVE);
            duk__append_u32(re_ctx, cap * 2);
            duk__parse_disjunction(re_ctx, 0, &tmp_disj);  /* retval (sub-atom char length) unused, tainted as complex above */
            duk__append_u32(re_ctx, DUK_REOP_SAVE);
            duk__append_u32(re_ctx, cap * 2 + 1);
            break;
        }
        case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__parse_disjunction(re_ctx, 0, &tmp_disj);
            new_atom_char_length = tmp_disj.charlen;
            break;
        }
        case DUK_RETOK_ATOM_START_CHARCLASS:
        case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
            /*
             *  Range parsing is done with a special lexer function which calls
             *  us for every range parsed.  This is different from how rest of
             *  the parsing works, but avoids a heavy, arbitrary size intermediate
             *  value type to hold the ranges.
             *
             *  Another complication is the handling of character ranges when
             *  case insensitive matching is used (see docs for discussion).
             *  The range handler callback given to the lexer takes care of this
             *  as well.
             *
             *  Note that duplicate ranges are not eliminated when parsing character
             *  classes, so that canonicalization of
             *
             *    [0-9a-fA-Fx-{]
             *
             *  creates the result (note the duplicate ranges):
             *
             *    [0-9A-FA-FX-Z{-{]
             *
             *  where [x-{] is split as a result of canonicalization.  The duplicate
             *  ranges are not a semantics issue: they work correctly.
             */

            duk_uint32_t offset;

            DUK_DD(DUK_DDPRINT("character class"));

            /* insert ranges instruction, range count patched in later */
            new_atom_char_length = 1;
            new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
            duk__append_u32(re_ctx,
                            (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
                            DUK_REOP_RANGES : DUK_REOP_INVRANGES);
            offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);    /* patch in range count later */

            /* parse ranges until character class ends */
            re_ctx->nranges = 0;    /* note: ctx-wide temporary */
            duk_lexer_parse_re_ranges(&re_ctx->lex, duk__generate_ranges, (void *) re_ctx);

            /* insert range count */
            duk__insert_u32(re_ctx, offset, re_ctx->nranges);
            break;
        }
        case DUK_RETOK_ATOM_END_GROUP: {
            if (expect_eof) {
                DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
            }
            goto done;
        }
        case DUK_RETOK_EOF: {
            if (!expect_eof) {
                DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
            }
            goto done;
        }
        default: {
            DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
        }
        }

        /* a complex (new) atom taints the result */
        if (new_atom_start_offset >= 0) {
            if (new_atom_char_length < 0) {
                res_charlen = -1;
            } else if (res_charlen >= 0) {
                /* only advance if not tainted */
                res_charlen += new_atom_char_length;
            }
        }

        /* record previous atom info in case next token is a quantifier */
        atom_start_offset = new_atom_start_offset;
        atom_char_length = new_atom_char_length;
        atom_start_captures = new_atom_start_captures;
    }

done:

    /* finish up pending jump and split for last alternative */
    if (unpatched_disjunction_jump >= 0) {
        duk_uint32_t offset;

        DUK_ASSERT(unpatched_disjunction_split >= 0);
        offset = unpatched_disjunction_jump;
        offset += duk__insert_jump_offset(re_ctx,
                                          offset,
                                          (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
        /* offset is now target of the pending split (right after jump) */
        duk__insert_jump_offset(re_ctx,
                                unpatched_disjunction_split,
                                offset - unpatched_disjunction_split);
    }

#if 0
    out_atom_info->end_captures = re_ctx->captures;
#endif
    out_atom_info->charlen = res_charlen;
    DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
                         (long) out_atom_info->charlen));

    re_ctx->recursion_depth--;
}
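duk__append_jump_offset() and duk__insert_jump_offset(), used heavily above, are not part of this excerpt. Judging from the call sites (the return value is the number of bytes emitted and the argument is a signed byte offset), they are presumably thin wrappers over the signed-value encoders shown in code example #6; a sketch consistent with that assumption:

/* Sketch only: assumed shape, derived from the call sites in this excerpt. */
DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
    /* Append a signed jump offset, encoded like any other signed value. */
    return duk__append_u32(re_ctx, duk__encode_i32(x));
}

DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
    /* Insert a signed jump offset at 'offset'; return the inserted byte
     * length so callers can advance their running offset.
     */
    return duk__insert_i32(re_ctx, offset, x);
}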
Code example #6
/* XXX: return type should probably be duk_size_t, or explicit checks are needed for
 * maximum size.
 */
DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
    duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
    duk_small_int_t len;

    len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
    DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, len);
    return (duk_uint32_t) len;
}

DUK_LOCAL duk_uint32_t duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
    duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
    duk_small_int_t len;

    len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
    DUK_BW_WRITE_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, buf, len);
    return (duk_uint32_t) len;
}

DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
    return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
}

#if 0  /* unused */
DUK_LOCAL duk_uint32_t duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
    return duk__append_u32(re_ctx, duk__encode_i32(x));
}
#endif  /* unused */
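duk__encode_i32(), called by duk__insert_i32() above, is also outside this excerpt. Since the bytecode stream only stores unsigned extended-UTF-8 values, a plausible sketch folds the sign into the least significant bit (an assumption; the exact mapping is whatever the matcher's decoder expects, and the INT32_MIN corner case is ignored here for brevity):

/* Sketch only: map a signed value onto an unsigned one so it can be
 * emitted with the u32 helpers; negative values get the low bit set.
 */
DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
    if (x < 0) {
        return (duk_uint32_t) (-x) * 2 + 1;
    } else {
        return (duk_uint32_t) x * 2;
    }
}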
Code example #7
/* XXX: return type should probably be duk_size_t, or explicit checks are needed for
 * maximum size.
 */
DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
	duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
	duk_small_int_t len;

	len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
	DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, len);
	return (duk_uint32_t) len;
}

DUK_LOCAL void duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
	DUK_BW_WRITE_ENSURE_XUTF8(re_ctx->thr, &re_ctx->bw, x);
}

DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
#if defined(DUK_USE_PREFER_SIZE)
	duk__append_u32(re_ctx, x);
#else
	DUK_ASSERT(x <= 0x7fU);
	DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);
#endif
}

#if 0
DUK_LOCAL void duk__append_2bytes(duk_re_compiler_ctx *re_ctx, duk_uint8_t x, duk_uint8_t y) {
	DUK_BW_WRITE_ENSURE_U8_2(re_ctx->thr, &re_ctx->bw, x, y);
}
#endif

DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
	return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
}

DUK_LOCAL void duk__append_reop(duk_re_compiler_ctx *re_ctx, duk_uint32_t reop) {
	DUK_ASSERT(reop <= 0x7fU);
	(void) duk__append_7bit(re_ctx, reop);
}

#if 0  /* unused */
DUK_LOCAL void duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
	duk__append_u32(re_ctx, duk__encode_i32(x));
}