/**
 * Insert a new bytecode to the bytecode container
 *
 * The bytes currently stored between 'offset' and the end of the emitted
 * bytecode are shifted towards the end by 'length' bytes, the input is copied
 * into the gap, and the current write position is advanced by 'length'.
 *
 * 'offset' must not exceed the current bytecode length and 'length' must not
 * exceed REGEXP_BYTECODE_BLOCK_SIZE. The input buffer must not overlap the
 * destination gap, because it is copied with memcpy.
 */
void
re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
                         size_t offset, /**< distance from the start of the container */
                         uint8_t *bytecode_p, /**< input bytecode */
                         size_t length) /**< length of input */
{
  JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
  /* An offset past the end would make the unsigned tail length below wrap around. */
  JERRY_ASSERT (offset <= re_get_bytecode_length (bc_ctx_p));

  if (bc_ctx_p->current_p + length > bc_ctx_p->block_end_p)
  {
    re_realloc_regexp_bytecode_block (bc_ctx_p);
  }

  /*
   * block_start_p is read only after the call above.
   * NOTE(review): presumably the reallocation may replace the block - confirm.
   */
  uint8_t *src_p = bc_ctx_p->block_start_p + offset;
  const size_t tail_length = (size_t) (re_get_bytecode_length (bc_ctx_p) - offset);

  if (tail_length > 0)
  {
    /*
     * Source and destination overlap whenever length < tail_length, so memmove
     * is required; it also avoids the temporary heap block (and its unchecked
     * allocation) that a copy-out / copy-back sequence would need.
     */
    memmove (src_p + length, src_p, tail_length);
  }

  memcpy (src_p, bytecode_p, length);

  bc_ctx_p->current_p += length;
} /* re_bytecode_list_insert */
/**
 * Enclose the given bytecode to a group
 *
 * Inserts the group index and the start opcode at 'group_start_offset', then
 * appends the end opcode, the group index, the quantifier bounds of the
 * current token and a jump offset measured back to the first byte after the
 * inserted start header. When the start opcode is neither
 * RE_OP_CAPTURE_GROUP_START nor RE_OP_NON_CAPTURE_GROUP_START, an additional
 * u32 length is inserted right after the start header.
 */
static void
re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
                      uint32_t group_start_offset, /**< offset of group start */
                      uint32_t idx, /**< index of group */
                      bool is_capturable) /**< is capturable group */
{
  re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;

  const re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable);
  const re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable);

  const uint32_t qmin = re_ctx_p->current_token.qmin;
  const uint32_t qmax = re_ctx_p->current_token.qmax;
  JERRY_ASSERT (qmin <= qmax);

  /* Group start header: measure how many bytes the two inserts add. */
  const uint32_t length_before_head = re_get_bytecode_length (bc_ctx_p);
  re_insert_u32 (bc_ctx_p, group_start_offset, idx);
  re_insert_opcode (bc_ctx_p, group_start_offset, start_opcode);
  const uint32_t head_length = re_get_bytecode_length (bc_ctx_p) - length_before_head;

  /* Group end trailer. */
  re_append_opcode (bc_ctx_p, end_opcode);
  re_append_u32 (bc_ctx_p, idx);
  re_append_u32 (bc_ctx_p, qmin);
  re_append_u32 (bc_ctx_p, qmax);

  /* First byte after the inserted start header. */
  const uint32_t body_start_offset = group_start_offset + head_length;
  re_append_jump_offset (bc_ctx_p, re_get_bytecode_length (bc_ctx_p) - body_start_offset);

  const bool is_plain_group_start = (start_opcode == RE_OP_CAPTURE_GROUP_START
                                     || start_opcode == RE_OP_NON_CAPTURE_GROUP_START);

  if (!is_plain_group_start)
  {
    const uint32_t length_after_head = re_get_bytecode_length (bc_ctx_p) - body_start_offset;
    re_insert_u32 (bc_ctx_p, body_start_offset, length_after_head);
  }
} /* re_insert_into_group */
/**
 * Insert simple atom iterator
 *
 * Terminates the atom with RE_OP_MATCH, then inserts in front of it (at
 * 'new_atom_start_offset') the atom's code length, qmax and qmin, and finally
 * the greedy or non-greedy iterator opcode, all at the same offset.
 */
static void
re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
                           uint32_t new_atom_start_offset) /**< atom start offset */
{
  re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;

  const uint32_t qmin = re_ctx_p->current_token.qmin;
  const uint32_t qmax = re_ctx_p->current_token.qmax;
  JERRY_ASSERT (qmin <= qmax);

  /* FIXME: optimize bytecode length. Store 0 rather than INF */

  /* complete 'sub atom' */
  re_append_opcode (bc_ctx_p, RE_OP_MATCH);

  /* The length is measured after the RE_OP_MATCH append, so it includes that opcode. */
  const uint32_t total_length = re_get_bytecode_length (bc_ctx_p);
  const uint32_t atom_code_length = (uint32_t) (total_length - new_atom_start_offset);

  re_insert_u32 (bc_ctx_p, new_atom_start_offset, atom_code_length);
  re_insert_u32 (bc_ctx_p, new_atom_start_offset, qmax);
  re_insert_u32 (bc_ctx_p, new_atom_start_offset, qmin);

  re_opcode_t iterator_opcode = RE_OP_NON_GREEDY_ITERATOR;

  if (re_ctx_p->current_token.greedy)
  {
    iterator_opcode = RE_OP_GREEDY_ITERATOR;
  }

  re_insert_opcode (bc_ctx_p, new_atom_start_offset, iterator_opcode);
} /* re_insert_simple_iterator */
/**
 * Enclose the given bytecode to a group and insert jump value
 *
 * The jump value is the number of bytecode bytes emitted since
 * 'group_start_offset'; it is inserted at that offset before the bytecode
 * is wrapped by re_insert_into_group.
 */
static void
re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
                                uint32_t group_start_offset, /**< offset of group start */
                                uint32_t idx, /**< index of group */
                                bool is_capturable) /**< is capturable group */
{
  re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
  const uint32_t jump_length = re_get_bytecode_length (bc_ctx_p) - group_start_offset;

  re_insert_u32 (bc_ctx_p, group_start_offset, jump_length);
  re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
} /* re_insert_into_group_with_jump */
/**
 * Parse alternatives
 *
 * Reads tokens one by one and emits the bytecode of each into the bytecode
 * container of the compiler context. Group and lookahead tokens recurse into
 * this function with 'expect_eof' set to false. The loop ends at a group end
 * or end-of-pattern token, or at the first error.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value
 */
static ecma_completion_value_t
re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
                      bool expect_eof) /**< expect end of file */
{
  re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();

  /* Start of the alternative currently being compiled; its length is inserted here once known. */
  uint32_t alternative_offset = re_get_bytecode_length (bc_ctx_p);

  while (true)
  {
    ECMA_TRY_CATCH (empty,
                    re_parse_next_token (re_ctx_p->parser_ctx_p, &(re_ctx_p->current_token)),
                    ret_value);
    ECMA_FINALIZE (empty);

    if (!ecma_is_completion_value_empty (ret_value))
    {
      /* Tokenizer error. */
      return ret_value;
    }

    const uint32_t atom_start_offset = re_get_bytecode_length (bc_ctx_p);

    switch (re_ctx_p->current_token.type)
    {
      case RE_TOK_START_CAPTURE_GROUP:
      {
        const uint32_t group_idx = re_ctx_p->num_of_captures++;
        JERRY_DDLOG ("Compile a capture group start (idx: %d)\n", group_idx);

        ret_value = re_parse_alternative (re_ctx_p, false);

        if (!ecma_is_completion_value_empty (ret_value))
        {
          /* Error inside the group body. */
          return ret_value;
        }

        re_insert_into_group (re_ctx_p, atom_start_offset, group_idx, true);
        break;
      }
      case RE_TOK_START_NON_CAPTURE_GROUP:
      {
        const uint32_t group_idx = re_ctx_p->num_of_non_captures++;
        JERRY_DDLOG ("Compile a non-capture group start (idx: %d)\n", group_idx);

        ret_value = re_parse_alternative (re_ctx_p, false);

        if (!ecma_is_completion_value_empty (ret_value))
        {
          /* Error inside the group body. */
          return ret_value;
        }

        re_insert_into_group (re_ctx_p, atom_start_offset, group_idx, false);
        break;
      }
      case RE_TOK_CHAR:
      {
        JERRY_DDLOG ("Compile character token: %c, qmin: %d, qmax: %d\n",
                     re_ctx_p->current_token.value,
                     re_ctx_p->current_token.qmin,
                     re_ctx_p->current_token.qmax);

        re_append_opcode (bc_ctx_p, RE_OP_CHAR);
        re_append_u32 (bc_ctx_p,
                       re_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
                                        re_ctx_p->flags & RE_FLAG_IGNORE_CASE));

        /* Only a quantifier other than exactly-once needs an iterator. */
        if (!((re_ctx_p->current_token.qmin == 1) && (re_ctx_p->current_token.qmax == 1)))
        {
          re_insert_simple_iterator (re_ctx_p, atom_start_offset);
        }
        break;
      }
      case RE_TOK_PERIOD:
      {
        JERRY_DDLOG ("Compile a period\n");
        re_append_opcode (bc_ctx_p, RE_OP_PERIOD);

        /* Only a quantifier other than exactly-once needs an iterator. */
        if (!((re_ctx_p->current_token.qmin == 1) && (re_ctx_p->current_token.qmax == 1)))
        {
          re_insert_simple_iterator (re_ctx_p, atom_start_offset);
        }
        break;
      }
      case RE_TOK_ALTERNATIVE:
      {
        JERRY_DDLOG ("Compile an alternative\n");

        /* Close the alternative compiled so far, then start a new one. */
        const uint32_t alternative_length = re_get_bytecode_length (bc_ctx_p) - alternative_offset;
        re_insert_u32 (bc_ctx_p, alternative_offset, alternative_length);
        re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
        alternative_offset = re_get_bytecode_length (bc_ctx_p);
        break;
      }
      case RE_TOK_ASSERT_START:
      {
        JERRY_DDLOG ("Compile a start assertion\n");
        re_append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
        break;
      }
      case RE_TOK_ASSERT_END:
      {
        JERRY_DDLOG ("Compile an end assertion\n");
        re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
        break;
      }
      case RE_TOK_ASSERT_WORD_BOUNDARY:
      {
        JERRY_DDLOG ("Compile a word boundary assertion\n");
        re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
        break;
      }
      case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
      {
        JERRY_DDLOG ("Compile a not word boundary assertion\n");
        re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
        break;
      }
      case RE_TOK_ASSERT_START_POS_LOOKAHEAD:
      {
        JERRY_DDLOG ("Compile a positive lookahead assertion\n");
        const uint32_t group_idx = re_ctx_p->num_of_non_captures++;
        re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);

        ret_value = re_parse_alternative (re_ctx_p, false);

        if (!ecma_is_completion_value_empty (ret_value))
        {
          /* Error inside the lookahead body. */
          return ret_value;
        }

        re_append_opcode (bc_ctx_p, RE_OP_MATCH);
        re_insert_into_group_with_jump (re_ctx_p, atom_start_offset, group_idx, false);
        break;
      }
      case RE_TOK_ASSERT_START_NEG_LOOKAHEAD:
      {
        JERRY_DDLOG ("Compile a negative lookahead assertion\n");
        const uint32_t group_idx = re_ctx_p->num_of_non_captures++;
        re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);

        ret_value = re_parse_alternative (re_ctx_p, false);

        if (!ecma_is_completion_value_empty (ret_value))
        {
          /* Error inside the lookahead body. */
          return ret_value;
        }

        re_append_opcode (bc_ctx_p, RE_OP_MATCH);
        re_insert_into_group_with_jump (re_ctx_p, atom_start_offset, group_idx, false);
        break;
      }
      case RE_TOK_BACKREFERENCE:
      {
        const uint32_t backref = (uint32_t) re_ctx_p->current_token.value;
        const uint32_t group_idx = re_ctx_p->num_of_non_captures++;

        /* Track the largest backreference number seen so far. */
        if (backref > re_ctx_p->highest_backref)
        {
          re_ctx_p->highest_backref = backref;
        }

        JERRY_DDLOG ("Compile a backreference: %d\n", backref);
        re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
        re_append_u32 (bc_ctx_p, backref);

        re_insert_into_group_with_jump (re_ctx_p, atom_start_offset, group_idx, false);
        break;
      }
      case RE_TOK_DIGIT:
      case RE_TOK_NOT_DIGIT:
      case RE_TOK_WHITE:
      case RE_TOK_NOT_WHITE:
      case RE_TOK_WORD_CHAR:
      case RE_TOK_NOT_WORD_CHAR:
      case RE_TOK_START_CHAR_CLASS:
      case RE_TOK_START_INV_CHAR_CLASS:
      {
        JERRY_DDLOG ("Compile a character class\n");

        re_opcode_t class_opcode = RE_OP_CHAR_CLASS;

        if (re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS)
        {
          class_opcode = RE_OP_INV_CHAR_CLASS;
        }

        re_append_opcode (bc_ctx_p, class_opcode);

        /* The class count is inserted here, right behind the opcode, after the class is parsed. */
        const uint32_t class_count_offset = re_get_bytecode_length (bc_ctx_p);

        ECMA_TRY_CATCH (empty,
                        re_parse_char_class (re_ctx_p->parser_ctx_p,
                                             re_append_char_class,
                                             re_ctx_p,
                                             &(re_ctx_p->current_token)),
                        ret_value);

        re_insert_u32 (bc_ctx_p, class_count_offset, re_ctx_p->parser_ctx_p->num_of_classes);

        /* Only a quantifier other than exactly-once needs an iterator. */
        if (!((re_ctx_p->current_token.qmin == 1) && (re_ctx_p->current_token.qmax == 1)))
        {
          re_insert_simple_iterator (re_ctx_p, atom_start_offset);
        }

        ECMA_FINALIZE (empty);

        if (ecma_is_completion_value_throw (ret_value))
        {
          /* Error while parsing the character class. */
          return ret_value;
        }
        break;
      }
      case RE_TOK_END_GROUP:
      {
        JERRY_DDLOG ("Compile a group end\n");

        if (expect_eof)
        {
          /* A closing paren without a matching open group. */
          return ecma_raise_syntax_error ("Unexpected end of paren.");
        }

        re_insert_u32 (bc_ctx_p,
                       alternative_offset,
                       re_get_bytecode_length (bc_ctx_p) - alternative_offset);
        return ret_value;
      }
      case RE_TOK_EOF:
      {
        if (expect_eof)
        {
          re_insert_u32 (bc_ctx_p,
                         alternative_offset,
                         re_get_bytecode_length (bc_ctx_p) - alternative_offset);
        }
        else
        {
          /* The pattern ended while a group was still open. */
          ret_value = ecma_raise_syntax_error ("Unexpected end of pattern.");
        }

        return ret_value;
      }
      default:
      {
        return ecma_raise_syntax_error ("Unexpected RegExp token.");
      }
    }
  }

  JERRY_UNREACHABLE ();
  return ret_value;
} /* re_parse_alternative */