/**
 * Read up to three consecutive octal digits from the current input position
 * and fold them into a single number.
 *
 * Scanning stops at the end of input, at the first character that is not an
 * octal digit, or once three digits have been consumed. The input position
 * of the parser context is advanced past every digit that was consumed.
 *
 * @return uint32_t - value of the parsed octal number
 *                    (0 if no octal digit was available)
 */
static uint32_t
re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
{
  uint32_t result = 0;
  int digits_left = 3;

  while (digits_left > 0)
  {
    if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
    {
      /* Ran out of input. */
      break;
    }

    if (!lit_char_is_octal_digit (*parser_ctx_p->input_curr_p))
    {
      /* Next character does not belong to the octal number. */
      break;
    }

    result = result * 8 + lit_char_hex_to_int (*parser_ctx_p->input_curr_p);
    parser_ctx_p->input_curr_p++;
    digits_left--;
  }

  return result;
} /* re_parse_octal */
/**
 * Convert the hexadecimal digits at the string iterator into a character code.
 *
 * Four digits are read for a UnicodeEscapeSequence and two digits for a
 * HexEscapeSequence. The iterator is advanced past every character that is
 * read, including a character that turns out not to be a hexadecimal digit.
 *
 * @return true - upon successful conversion,
 *         false - otherwise (the end of the string was reached before the required
 *                 number of digits was read, or a non-hexadecimal character was found);
 *                 the output character is not written in this case.
 */
static bool
lexer_convert_escape_sequence_digits_to_char (lit_utf8_iterator_t *src_iter_p, /**< string iterator */
                                              bool is_unicode_escape_sequence, /**< UnicodeEscapeSequence (true)
                                                                                *   or HexEscapeSequence (false) */
                                              ecma_char_t *out_converted_char_p) /**< out: converted character */
{
  uint32_t remaining = is_unicode_escape_sequence ? 4 : 2;
  uint16_t code = 0;

  while (remaining > 0)
  {
    if (lit_utf8_iterator_is_eos (src_iter_p))
    {
      return false;
    }

    const ecma_char_t digit_char = lit_utf8_iterator_read_next (src_iter_p);

    if (!lit_char_is_hex_digit (digit_char))
    {
      return false;
    }

    /* The top nibble must still be free, otherwise the shift below would drop bits. */
    JERRY_ASSERT ((code & 0xF000u) == 0);

    code = (uint16_t) (code << 4u);
    code = (uint16_t) (code + lit_char_hex_to_int (digit_char));

    remaining--;
  }

  *out_converted_char_p = (ecma_char_t) code;

  return true;
} /* lexer_convert_escape_sequence_digits_to_char */
/**
 * Read the input pattern and parse the next token for the RegExp compiler
 *
 * One character is read from the current input position and dispatched on:
 *  - '|', '^', '$' produce alternative / assertion tokens without a quantifier;
 *  - '.', ')', escaped atoms and ordinary characters are followed by a call to
 *    re_parse_iterator, which fills in qmin / qmax / greedy of the token;
 *  - '(' distinguishes capture groups, non-capture groups and lookaheads;
 *  - '[' distinguishes normal and inverted character classes;
 *  - a quantifier character ('?', '*', '+', '{') at token start is a syntax error.
 *
 * When the input is exhausted (or a NULL character is read) the token type is
 * set to RE_TOK_EOF.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value
 */
ecma_completion_value_t
re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
                     re_token_t *out_token_p) /**< out: output token */
{
  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();

  /* Nothing left to read: report end of pattern. */
  if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
  {
    out_token_p->type = RE_TOK_EOF;
    return ret_value;
  }

  ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);

  switch (ch)
  {
    case LIT_CHAR_VLINE:
    {
      out_token_p->type = RE_TOK_ALTERNATIVE;
      break;
    }
    case LIT_CHAR_CIRCUMFLEX:
    {
      out_token_p->type = RE_TOK_ASSERT_START;
      break;
    }
    case LIT_CHAR_DOLLAR_SIGN:
    {
      out_token_p->type = RE_TOK_ASSERT_END;
      break;
    }
    case LIT_CHAR_DOT:
    {
      out_token_p->type = RE_TOK_PERIOD;
      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
      break;
    }
    case LIT_CHAR_BACKSLASH:
    {
      /* A backslash must be followed by at least one more character. */
      if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
      {
        /* NOTE(review): the message below is misspelled ("experssion"); it is a runtime string,
         * so it is left untouched here. */
        return ecma_raise_syntax_error ("invalid regular experssion");
      }

      /* Default for escapes: a plain character token; specific escapes override the type below. */
      out_token_p->type = RE_TOK_CHAR;
      ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);

      if (ch == LIT_CHAR_LOWERCASE_B)
      {
        out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY;
      }
      else if (ch == LIT_CHAR_UPPERCASE_B)
      {
        out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY;
      }
      else if (ch == LIT_CHAR_LOWERCASE_F)
      {
        out_token_p->value = LIT_CHAR_FF;
      }
      else if (ch == LIT_CHAR_LOWERCASE_N)
      {
        out_token_p->value = LIT_CHAR_LF;
      }
      else if (ch == LIT_CHAR_LOWERCASE_T)
      {
        out_token_p->value = LIT_CHAR_TAB;
      }
      else if (ch == LIT_CHAR_LOWERCASE_R)
      {
        out_token_p->value = LIT_CHAR_CR;
      }
      else if (ch == LIT_CHAR_LOWERCASE_V)
      {
        out_token_p->value = LIT_CHAR_VTAB;
      }
      else if (ch == LIT_CHAR_LOWERCASE_C)
      {
        /* Control escape: \c followed by an ASCII letter yields (letter % 32). */
        if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
        {
          ch = *parser_ctx_p->input_curr_p;

          if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
              || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END))
          {
            out_token_p->value = (ch % 32);
            parser_ctx_p->input_curr_p++;
          }
          else
          {
            /* Not a control letter: the token becomes a literal backslash and the input
             * position steps back by one, so the 'c' is read again by the next call. */
            out_token_p->value = LIT_CHAR_BACKSLASH;
            parser_ctx_p->input_curr_p--;
          }
        }
        else
        {
          /* \c at the very end of the pattern: same literal-backslash fallback as above. */
          out_token_p->value = LIT_CHAR_BACKSLASH;
          parser_ctx_p->input_curr_p--;
        }
      }
      else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2))
      {
        /* \xHH - taken only when re_hex_lookup accepts the next 2 characters;
         * otherwise control falls through the chain and 'x' is used as a plain character. */
        lit_code_point_t code_point;

        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 2, &code_point))
        {
          return ecma_raise_syntax_error ("decode error");
        }

        parser_ctx_p->input_curr_p += 2;
        out_token_p->value = code_point;
      }
      else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4))
      {
        /* \uHHHH - same scheme as \xHH, with 4 characters. */
        lit_code_point_t code_point;

        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 4, &code_point))
        {
          return ecma_raise_syntax_error ("decode error");
        }

        parser_ctx_p->input_curr_p += 4;
        out_token_p->value = code_point;
      }
      /*
       * NOTE(review): the 'break' statements in the six class-escape branches below leave the
       * enclosing switch, so re_parse_iterator is NOT called for these tokens and
       * qmin / qmax / greedy are not written by this function on these paths.
       * Confirm that the consumer of these token types handles quantifiers itself.
       */
      else if (ch == LIT_CHAR_LOWERCASE_D)
      {
        out_token_p->type = RE_TOK_DIGIT;
        break;
      }
      else if (ch == LIT_CHAR_UPPERCASE_D)
      {
        out_token_p->type = RE_TOK_NOT_DIGIT;
        break;
      }
      else if (ch == LIT_CHAR_LOWERCASE_S)
      {
        out_token_p->type = RE_TOK_WHITE;
        break;
      }
      else if (ch == LIT_CHAR_UPPERCASE_S)
      {
        out_token_p->type = RE_TOK_NOT_WHITE;
        break;
      }
      else if (ch == LIT_CHAR_LOWERCASE_W)
      {
        out_token_p->type = RE_TOK_WORD_CHAR;
        break;
      }
      else if (ch == LIT_CHAR_UPPERCASE_W)
      {
        out_token_p->type = RE_TOK_NOT_WORD_CHAR;
        break;
      }
      else if (lit_char_is_decimal_digit (ch))
      {
        if (ch == LIT_CHAR_0)
        {
          /* \0 is the NULL character, but only when it is not followed by another decimal digit. */
          if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
              && lit_char_is_decimal_digit (*parser_ctx_p->input_curr_p))
          {
            return ecma_raise_syntax_error ("RegExp escape pattern error.");
          }

          out_token_p->value = LIT_UNICODE_CODE_POINT_NULL;
        }
        else
        {
          /* A group count of -1 means "not counted yet"; count on first use. */
          if (parser_ctx_p->num_of_groups == -1)
          {
            re_count_num_of_groups (parser_ctx_p);
          }

          if (parser_ctx_p->num_of_groups)
          {
            /* Step back onto the first digit, then read the whole decimal number. */
            parser_ctx_p->input_curr_p--;
            uint32_t number = 0;
            int index = 0; /* number of digits consumed so far */

            do
            {
              /* The length check runs before the next character is inspected, so the error is
               * raised as soon as RE_MAX_RE_DECESC_DIGITS digits have been consumed. */
              if (index >= RE_MAX_RE_DECESC_DIGITS)
              {
                ret_value = ecma_raise_syntax_error ("RegExp escape pattern error: decimal escape too long.");
                return ret_value;
              }
              if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
              {
                break;
              }

              ecma_char_t digit = *parser_ctx_p->input_curr_p++;

              if (!lit_char_is_decimal_digit (digit))
              {
                /* Un-read the non-digit character. */
                parser_ctx_p->input_curr_p--;
                break;
              }
              number = number * 10 + lit_char_hex_to_int (digit);
              index++;
            }
            while (true);

            if ((int) number <= parser_ctx_p->num_of_groups)
            {
              out_token_p->type = RE_TOK_BACKREFERENCE;
            }
            else /* Invalid backreference, fallback to octal */
            {
              /* Rewind to start of number. */
              parser_ctx_p->input_curr_p -= index;

              /* Try to reparse as octal. */
              ecma_char_t digit = *parser_ctx_p->input_curr_p;

              if (!lit_char_is_octal_digit (digit))
              {
                /* Not octal, keep digit character value. */
                number = digit;
                parser_ctx_p->input_curr_p++;
              }
              else
              {
                number = re_parse_octal (parser_ctx_p);
              }
            }
            out_token_p->value = number;
          }
          else /* Invalid backreference, fallback to octal if possible */
          {
            if (!lit_char_is_octal_digit (ch))
            {
              /* Not octal, keep character value. */
              out_token_p->value = ch;
            }
            else
            {
              /* Step back onto the digit so that re_parse_octal sees it. */
              parser_ctx_p->input_curr_p--;
              out_token_p->value = re_parse_octal (parser_ctx_p);
            }
          }
        }
      }
      else
      {
        /* Any other escaped character stands for itself. */
        out_token_p->value = ch;
      }

      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
      break;
    }
    case LIT_CHAR_LEFT_PAREN:
    {
      if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
      {
        return ecma_raise_syntax_error ("Unterminated group");
      }

      if (*parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
      {
        parser_ctx_p->input_curr_p++;
        if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
        {
          return ecma_raise_syntax_error ("Invalid group");
        }

        ch = *parser_ctx_p->input_curr_p++;

        if (ch == LIT_CHAR_EQUALS)
        {
          /* (?= */
          out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD;
        }
        else if (ch == LIT_CHAR_EXCLAMATION)
        {
          /* (?! */
          out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD;
        }
        else if (ch == LIT_CHAR_COLON)
        {
          /* (?: */
          out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP;
        }
        else
        {
          return ecma_raise_syntax_error ("Invalid group");
        }
      }
      else
      {
        /* ( */
        out_token_p->type = RE_TOK_START_CAPTURE_GROUP;
      }
      break;
    }
    case LIT_CHAR_RIGHT_PAREN:
    {
      /* A closing parenthesis may carry a quantifier for the whole group. */
      out_token_p->type = RE_TOK_END_GROUP;
      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
      break;
    }
    case LIT_CHAR_LEFT_SQUARE:
    {
      out_token_p->type = RE_TOK_START_CHAR_CLASS;

      if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
      {
        return ecma_raise_syntax_error ("invalid character class");
      }

      /* A leading '^' turns the class into an inverted one and is consumed here. */
      if (*parser_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX)
      {
        out_token_p->type = RE_TOK_START_INV_CHAR_CLASS;
        parser_ctx_p->input_curr_p++;
      }

      break;
    }
    case LIT_CHAR_QUESTION:
    case LIT_CHAR_ASTERISK:
    case LIT_CHAR_PLUS:
    case LIT_CHAR_LEFT_BRACE:
    {
      /* Quantifiers are consumed by re_parse_iterator; seeing one here means it has no atom. */
      return ecma_raise_syntax_error ("Invalid RegExp token.");
    }
    case LIT_CHAR_NULL:
    {
      out_token_p->type = RE_TOK_EOF;
      break;
    }
    default:
    {
      /* Ordinary pattern character. */
      out_token_p->type = RE_TOK_CHAR;
      out_token_p->value = ch;
      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
      break;
    }
  }

  return ret_value;
} /* re_parse_next_token */
/**
 * Parse RegExp iterators
 *
 * Looks at the character at the current input position and, if it starts a
 * quantifier ('?', '*', '+' or '{...}'), consumes it and stores the repetition
 * bounds and greediness in the token. When no quantifier follows (or the input
 * is exhausted) the token keeps the defaults qmin = 1, qmax = 1, greedy = true
 * and no input is consumed.
 *
 * Braced quantifiers are parsed with a single accumulator: digits are collected
 * into 'qmin'; when a comma is seen the collected value is parked in 'qmax' and
 * the accumulator is restarted for the upper bound. 'qmax' holding
 * RE_ITERATOR_INFINITE therefore doubles as the "no comma seen yet" marker.
 *
 * A syntax error is raised for an unterminated or malformed braced quantifier,
 * for a bound that has too many digits or does not fit into uint32_t, and when
 * the resulting lower bound is greater than the upper bound.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value
 */
static ecma_completion_value_t
re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
                   re_token_t *re_token_p) /**< out: output token */
{
  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();

  /* Defaults for an atom without a quantifier. */
  re_token_p->qmin = 1;
  re_token_p->qmax = 1;
  re_token_p->greedy = true;

  if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
  {
    return ret_value;
  }

  /* Peek only: the character is consumed inside the matching case below. */
  ecma_char_t ch = *parser_ctx_p->input_curr_p;

  switch (ch)
  {
    case LIT_CHAR_QUESTION:
    {
      parser_ctx_p->input_curr_p++;
      re_token_p->qmin = 0;
      re_token_p->qmax = 1;
      re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
      break;
    }
    case LIT_CHAR_ASTERISK:
    {
      parser_ctx_p->input_curr_p++;
      re_token_p->qmin = 0;
      re_token_p->qmax = RE_ITERATOR_INFINITE;
      re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
      break;
    }
    case LIT_CHAR_PLUS:
    {
      parser_ctx_p->input_curr_p++;
      re_token_p->qmin = 1;
      re_token_p->qmax = RE_ITERATOR_INFINITE;
      re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
      break;
    }
    case LIT_CHAR_LEFT_BRACE:
    {
      parser_ctx_p->input_curr_p++;
      uint32_t qmin = 0;                     /* accumulator for the bound currently being read */
      uint32_t qmax = RE_ITERATOR_INFINITE;  /* parked lower bound once a comma has been seen */
      uint32_t digits = 0;                   /* digits read for the current bound */
      const uint32_t u32_max = (uint32_t) -1;

      while (true)
      {
        if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
        {
          return ecma_raise_syntax_error ("invalid quantifier");
        }

        ch = *parser_ctx_p->input_curr_p++;

        if (lit_char_is_decimal_digit (ch))
        {
          if (digits >= ECMA_NUMBER_MAX_DIGITS)
          {
            return ecma_raise_syntax_error ("RegExp quantifier error: too many digits.");
          }

          const uint32_t digit = (uint32_t) lit_char_hex_to_int (ch);

          /*
           * Unsigned arithmetic wraps silently, so a bound such as 4294967297 would
           * otherwise be stored as 1. Reject values that do not fit instead.
           */
          if (qmin > (u32_max - digit) / 10)
          {
            return ecma_raise_syntax_error ("RegExp quantifier error: number too large.");
          }

          digits++;
          qmin = qmin * 10 + digit;
        }
        else if (ch == LIT_CHAR_COMMA)
        {
          if (qmax != RE_ITERATOR_INFINITE)
          {
            return ecma_raise_syntax_error ("RegExp quantifier error: double comma.");
          }

          if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
          {
            return ecma_raise_syntax_error ("invalid quantifier");
          }

          if (*parser_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE)
          {
            /* "{n,}" form: open upper bound. */
            if (digits == 0)
            {
              return ecma_raise_syntax_error ("RegExp quantifier error: missing digits.");
            }

            parser_ctx_p->input_curr_p++;
            re_token_p->qmin = qmin;
            re_token_p->qmax = RE_ITERATOR_INFINITE;
            break;
          }

          /*
           * Park the lower bound and restart the accumulator for the upper bound.
           * NOTE(review): 'digits' is not checked here, so "{,n}" is accepted and
           * treated as "{0,n}" - confirm whether that is intended.
           */
          qmax = qmin;
          qmin = 0;
          digits = 0;
        }
        else if (ch == LIT_CHAR_RIGHT_BRACE)
        {
          if (digits == 0)
          {
            return ecma_raise_syntax_error ("RegExp quantifier error: missing digits.");
          }

          if (qmax != RE_ITERATOR_INFINITE)
          {
            /* "{n,m}" form: 'qmax' holds n, the accumulator holds m. */
            re_token_p->qmin = qmax;
            re_token_p->qmax = qmin;
          }
          else
          {
            /* "{n}" form: exact repetition count. */
            re_token_p->qmin = qmin;
            re_token_p->qmax = qmin;
          }

          break;
        }
        else
        {
          return ecma_raise_syntax_error ("RegExp quantifier error: unknown char.");
        }
      }

      re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
      break;
    }
    default:
    {
      /* Not a quantifier: leave the character for the next token. */
      break;
    }
  }

  JERRY_ASSERT (ecma_is_completion_value_empty (ret_value));

  if (re_token_p->qmin > re_token_p->qmax)
  {
    ret_value = ecma_raise_syntax_error ("RegExp quantifier error: qmin > qmax.");
  }

  return ret_value;
} /* re_parse_iterator */
/**
 * Parse numeric literal (ECMA-262, v5, 7.8.3)
 *
 * The literal is classified from its first characters and scanned with the
 * lexer's lookahead / consume primitives:
 *  - "0x" / "0X" prefix: hexadecimal integer, accumulated digit by digit in base 16;
 *  - contains '.' or an exponent marker: converted with ecma_utf8_string_to_number;
 *  - starts with '0' and is longer than one character: octal integer
 *    (parse error in strict mode), accumulated in base 8;
 *  - otherwise: decimal integer, accumulated from the last digit to the first.
 *
 * An integral result in the range [0, 255] is returned as a TOK_SMALL_INT token;
 * every other value is handed to convert_seen_num_to_token.
 * is_token_parse_in_progress is reset to NULL before each return.
 *
 * @return token of TOK_SMALL_INT or TOK_NUMBER types
 */
static token
lexer_parse_number (void)
{
  ecma_char_t c = LA (0);
  bool is_hex = false;
  bool is_fp = false;
  ecma_number_t fp_res = .0;
  size_t tok_length = 0, i;
  token known_token;

  /* The caller must position the lexer on a digit or on a '.' that starts a number. */
  JERRY_ASSERT (lit_char_is_decimal_digit (c) || c == LIT_CHAR_DOT);

  if (c == LIT_CHAR_0)
  {
    if (LA (1) == LIT_CHAR_LOWERCASE_X || LA (1) == LIT_CHAR_UPPERCASE_X)
    {
      is_hex = true;
    }
  }
  else if (c == LIT_CHAR_DOT)
  {
    JERRY_ASSERT (lit_char_is_decimal_digit (LA (1)));
    is_fp = true;
  }

  if (is_hex)
  {
    /* Eat up '0x'; new_token is called afterwards, before the hex digits are scanned */
    consume_char ();
    consume_char ();
    new_token ();

    c = LA (0);
    if (!lit_char_is_hex_digit (c))
    {
      PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX, "Invalid HexIntegerLiteral", lit_utf8_iterator_get_pos (&src_iter));
    }

    do
    {
      consume_char ();
      c = LA (0);
    }
    while (lit_char_is_hex_digit (c));

    if (lexer_is_char_can_be_identifier_start (c))
    {
      PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX,
                   "Identifier just after integer literal",
                   lit_utf8_iterator_get_pos (&src_iter));
    }

    tok_length = (size_t) (TOK_SIZE ());

    const lit_utf8_byte_t *fp_buf_p = TOK_START ();
    /* token is constructed at end of function */
    for (i = 0; i < tok_length; i++)
    {
      fp_res = fp_res * 16 + (ecma_number_t) lit_char_hex_to_int (fp_buf_p[i]);
    }
  }
  else
  {
    bool is_exp = false;

    new_token ();

    /* Eat up '.' (only when the literal starts with it) */
    if (is_fp)
    {
      consume_char ();
    }

    /* Scan the rest of the literal: digits, at most one '.', at most one exponent marker. */
    while (true)
    {
      c = LA (0);

      if (c == LIT_CHAR_DOT)
      {
        if (is_fp)
        {
          /* A second '.' ends the literal. */
          /* token is constructed at end of function */
          break;
        }
        else
        {
          is_fp = true;
          consume_char ();

          continue;
        }
      }
      else if (c == LIT_CHAR_LOWERCASE_E || c == LIT_CHAR_UPPERCASE_E)
      {
        if (is_exp)
        {
          PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX,
                       "Numeric literal shall not contain more than exponential marker ('e' or 'E')",
                       lit_utf8_iterator_get_pos (&src_iter));
        }
        else
        {
          is_exp = true;
          consume_char ();

          /* Optional sign directly after the exponent marker. */
          if (LA (0) == LIT_CHAR_MINUS || LA (0) == LIT_CHAR_PLUS)
          {
            consume_char ();
          }

          continue;
        }
      }
      else if (!lit_char_is_decimal_digit (c))
      {
        if (lexer_is_char_can_be_identifier_start (c))
        {
          PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX,
                       "Numeric literal shall not contain non-numeric characters",
                       lit_utf8_iterator_get_pos (&src_iter));
        }

        /* token is constructed at end of function */
        break;
      }

      consume_char ();
    }

    tok_length = (size_t) (TOK_SIZE ());

    if (is_fp || is_exp)
    {
      ecma_number_t res = ecma_utf8_string_to_number (TOK_START (), (jerry_api_size_t) tok_length);
      /* NOTE(review): the scan above does not require digits after an exponent marker;
       * presumably such input makes the conversion return NaN and trips this assertion - confirm. */
      JERRY_ASSERT (!ecma_number_is_nan (res));

      known_token = convert_seen_num_to_token (res);
      is_token_parse_in_progress = NULL;

      return known_token;
    }
    else if (*TOK_START () == LIT_CHAR_0 && tok_length != 1)
    {
      /* Octal integer literals */
      if (strict_mode)
      {
        PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX, "Octal integer literals are not allowed in strict mode", token_start_pos);
      }
      else
      {
        /* token is constructed at end of function */
        const lit_utf8_byte_t *fp_buf_p = TOK_START ();

        /* NOTE(review): the scan loop accepts any decimal digit, so '8' and '9' can reach this
         * base-8 accumulation unchecked (e.g. "089") - confirm the intended handling. */
        for (i = 0; i < tok_length; i++)
        {
          fp_res = fp_res * 8 + (ecma_number_t) lit_char_hex_to_int (fp_buf_p[i]);
        }
      }
    }
    else
    {
      const lit_utf8_byte_t *fp_buf_p = TOK_START ();
      /* token is constructed at end of function */

      /* Decimal integer: walk from the last digit to the first, scaling by a growing power of ten. */
      ecma_number_t mult = 1.0f;
      for (i = tok_length; i > 0; i--, mult *= 10)
      {
        fp_res += (ecma_number_t) lit_char_hex_to_int (fp_buf_p[i - 1]) * mult;
      }
    }
  }

  /* Integral values that fit into one byte get the compact TOK_SMALL_INT representation. */
  if (fp_res >= 0 && fp_res <= 255 && (uint8_t) fp_res == fp_res)
  {
    known_token = create_token (TOK_SMALL_INT, (uint8_t) fp_res);
    is_token_parse_in_progress = NULL;
    return known_token;
  }
  else
  {
    known_token = convert_seen_num_to_token (fp_res);
    is_token_parse_in_progress = NULL;
    return known_token;
  }
} /* lexer_parse_number */