/**
 * Generate a random code unit and store its encoding in the specified buffer
 *
 * The code point is drawn from a half-open range chosen by char_size:
 *  - 1: [0, LIT_UTF8_1_BYTE_CODE_POINT_MAX)
 *  - 2: [LIT_UTF8_2_BYTE_CODE_POINT_MIN, LIT_UTF8_2_BYTE_CODE_POINT_MAX)
 *  - 3: [LIT_UTF8_3_BYTE_CODE_POINT_MIN, LIT_UTF8_3_BYTE_CODE_POINT_MAX)
 *  - any other accepted value: [0, LIT_UTF8_3_BYTE_CODE_POINT_MAX)
 * A value that lands in the UTF-16 surrogate range is replaced with the
 * code point directly below that range.
 *
 * @return value returned by lit_code_unit_to_utf8 for the generated code unit
 */
static lit_utf8_size_t
generate_cesu8_char (utf8_char_size char_size, /**< requested character size */
                     lit_utf8_byte_t *buf) /**< [out] buffer receiving the encoded character */
{
  JERRY_ASSERT (char_size >= 0 && char_size <= LIT_CESU8_MAX_BYTES_IN_CODE_UNIT);

  /* Half-open range [range_start, range_start + range_span) to draw from. */
  lit_code_point_t range_start = 0;
  lit_code_point_t range_span = LIT_UTF8_3_BYTE_CODE_POINT_MAX;

  if (char_size == 1)
  {
    range_span = LIT_UTF8_1_BYTE_CODE_POINT_MAX;
  }
  else if (char_size == 2)
  {
    range_start = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
    range_span = LIT_UTF8_2_BYTE_CODE_POINT_MAX - LIT_UTF8_2_BYTE_CODE_POINT_MIN;
  }
  else if (char_size == 3)
  {
    range_start = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
    range_span = LIT_UTF8_3_BYTE_CODE_POINT_MAX - LIT_UTF8_3_BYTE_CODE_POINT_MIN;
  }

  lit_code_point_t generated = range_start + ((lit_code_point_t) rand ()) % range_span;

  /* Surrogates are not generated: clamp them to the last code point before the surrogate range. */
  if (generated >= LIT_UTF16_HIGH_SURROGATE_MIN
      && generated <= LIT_UTF16_LOW_SURROGATE_MAX)
  {
    generated = LIT_UTF16_HIGH_SURROGATE_MIN - 1;
  }

  return lit_code_unit_to_utf8 ((ecma_char_t) generated, buf);
} /* generate_cesu8_char */
/**
 * Create a new ecma-string that contains the cesu-8 encoding of a single code unit
 *
 * The code unit is first encoded into a small stack buffer, then the string is
 * built from the bytes that were produced.
 *
 * @return pointer to ecma-string descriptor
 */
ecma_string_t *
ecma_new_ecma_string_from_code_unit (ecma_char_t code_unit) /**< code unit */
{
  lit_utf8_byte_t encoded_bytes[LIT_UTF8_MAX_BYTES_IN_CODE_UNIT];
  const lit_utf8_size_t encoded_size = lit_code_unit_to_utf8 (code_unit, encoded_bytes);

  return ecma_new_ecma_string_from_utf8 (encoded_bytes, encoded_size);
} /* ecma_new_ecma_string_from_code_unit */
/**
 * The String object's 'fromCharCode' routine
 *
 * See also:
 *          ECMA-262 v5, 15.5.3.2
 *
 * Each argument is converted to a number, reduced to a 16-bit code unit and
 * appended to a temporary buffer; the result string is created from that buffer.
 * Processing stops at the first argument whose conversion leaves a non-empty
 * value in the result, and that value is returned instead of a string.
 *
 * @return ecma value
 *         Returned value must be freed with ecma_free_value.
 */
static ecma_value_t
ecma_builtin_string_object_from_char_code (ecma_value_t this_arg, /**< 'this' argument */
                                           const ecma_value_t args[], /**< arguments list */
                                           ecma_length_t args_number) /**< number of arguments */
{
  JERRY_UNUSED (this_arg);

  /* Without arguments the result is the empty string. */
  if (args_number == 0)
  {
    return ecma_make_magic_string_value (LIT_MAGIC_STRING__EMPTY);
  }

  ecma_value_t result = ECMA_VALUE_EMPTY;
  ecma_string_t *result_str_p = NULL;

  /* Room for the largest possible encoding of every argument. */
  const lit_utf8_size_t buffer_size = args_number * LIT_CESU8_MAX_BYTES_IN_CODE_UNIT;

  JMEM_DEFINE_LOCAL_ARRAY (buffer_p, buffer_size, lit_utf8_byte_t);

  lit_utf8_size_t bytes_written = 0;
  ecma_length_t index = 0;

  while (index < args_number && ecma_is_value_empty (result))
  {
    ECMA_OP_TO_NUMBER_TRY_CATCH (char_code_num, args[index], result);

    /* Only the low 16 bits of the converted number are kept. */
    const ecma_char_t code_unit = (uint16_t) ecma_number_to_uint32 (char_code_num);

    JERRY_ASSERT (bytes_written <= buffer_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
    bytes_written += lit_code_unit_to_utf8 (code_unit, buffer_p + bytes_written);
    JERRY_ASSERT (bytes_written <= buffer_size);

    ECMA_OP_TO_NUMBER_FINALIZE (char_code_num);

    index++;
  }

  /* The string has to be created before the temporary buffer is released. */
  if (ecma_is_value_empty (result))
  {
    result_str_p = ecma_new_ecma_string_from_utf8 (buffer_p, bytes_written);
  }

  JMEM_FINALIZE_LOCAL_ARRAY (buffer_p);

  return ecma_is_value_empty (result) ? ecma_make_string_value (result_str_p) : result;
} /* ecma_builtin_string_object_from_char_code */
/**
 * The String object's 'fromCharCode' routine
 *
 * See also:
 *          ECMA-262 v5, 15.5.3.2
 *
 * Each argument is converted to a number, reduced to a 16-bit code unit and
 * appended to a temporary buffer; the result string is created from that buffer.
 * If converting an argument throws, the buffer is released and the throw
 * completion value is returned.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value.
 */
static ecma_completion_value_t
ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused___, /**< 'this' argument */
                                           const ecma_value_t args[], /**< arguments list */
                                           ecma_length_t args_number) /**< number of arguments */
{
  ecma_completion_value_t completion = ecma_make_empty_completion_value ();

  /* Without arguments the result is the empty string. */
  if (args_number == 0)
  {
    return ecma_make_normal_completion_value (ecma_make_string_value (ecma_new_ecma_string_from_utf8 (NULL, 0)));
  }

  /* Room for the largest possible encoding of every argument. */
  const lit_utf8_size_t buffer_size = args_number * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
  ecma_string_t *result_str_p;

  MEM_DEFINE_LOCAL_ARRAY (buffer_p, buffer_size, lit_utf8_byte_t);

  lit_utf8_size_t bytes_written = 0;

  FIXME ("Support surrogate pairs");

  ecma_length_t index = 0;

  while (index < args_number)
  {
    ECMA_OP_TO_NUMBER_TRY_CATCH (char_code_num, args[index], completion);

    /* Only the low 16 bits of the converted number are kept. */
    const ecma_char_t code_unit = (uint16_t) ecma_number_to_uint32 (char_code_num);

    JERRY_ASSERT (bytes_written <= buffer_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
    bytes_written += lit_code_unit_to_utf8 (code_unit, buffer_p + bytes_written);
    JERRY_ASSERT (bytes_written <= buffer_size);

    ECMA_OP_TO_NUMBER_FINALIZE (char_code_num);

    /* Early exit bypasses MEM_FINALIZE_LOCAL_ARRAY, so the buffer is released by hand. */
    if (ecma_is_completion_value_throw (completion))
    {
      mem_heap_free_block (buffer_p);

      return completion;
    }

    JERRY_ASSERT (ecma_is_completion_value_empty (completion));

    index++;
  }

  result_str_p = ecma_new_ecma_string_from_utf8 (buffer_p, bytes_written);

  MEM_FINALIZE_LOCAL_ARRAY (buffer_p);

  return ecma_make_normal_completion_value (ecma_make_string_value (result_str_p));
} /* ecma_builtin_string_object_from_char_code */
/**
 * Convert specified string to token of specified type, transforming escape sequences
 *
 * The source is copied byte by byte into a temporary buffer of the same size while
 * escape sequences are replaced with the characters they denote:
 *  - \uXXXX and \xXX are replaced with the code unit given by the hex digits;
 *  - a backslash followed by a line terminator is dropped (CR LF counts as one terminator);
 *  - any other non-digit escape is handled by convert_single_escape_character;
 *  - a backslash followed by a non-zero decimal digit, by non-hex or too few hex digits,
 *    or a backslash that is the last byte of the source is reported as
 *    "Malformed escape sequence".
 *
 * For TOK_NAME tokens the result is looked up as a keyword when it consists of lower case
 * letters only; otherwise it must consist of letters, digits, '$' and '_'.
 *
 * NOTE(review): a source byte is taken as a code unit of its own. If lit_code_unit_to_utf8
 *               emits more than one byte for values above 0x7F, the output can outgrow the
 *               source-sized buffer - confirm that the source is ASCII-only at this point.
 *
 * @return token descriptor
 */
static token
convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of token to produce */
                                              const jerry_api_char_t *source_str_p, /**< string to convert,
                                                                                     *   located in source buffer */
                                              size_t source_str_size) /**< size of the string */
{
  token ret;

  if (source_str_size == 0)
  {
    return convert_string_to_token (tok_type,
                                    lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY),
                                    0);
  }
  else
  {
    JERRY_ASSERT (source_str_p != NULL);
  }

  lit_utf8_byte_t *str_buf_p = (lit_utf8_byte_t*) jsp_mm_alloc (source_str_size);

  const lit_utf8_byte_t *source_str_iter_p = source_str_p;
  lit_utf8_byte_t *str_buf_iter_p = str_buf_p;

  bool is_correct_sequence = true;
  bool every_char_islower = true;
  bool every_char_allowed_in_identifier = true;

  while (source_str_iter_p < source_str_p + source_str_size)
  {
    ecma_char_t converted_char;

    if (*source_str_iter_p != '\\')
    {
      converted_char = (lit_utf8_byte_t) *source_str_iter_p++;

      JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
      JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size);
    }
    else
    {
      source_str_iter_p++;

      /*
       * A backslash that ends the source has no escape character after it:
       * reject it instead of reading past the end of the string.
       */
      if (source_str_iter_p >= source_str_p + source_str_size)
      {
        is_correct_sequence = false;
        break;
      }

      const lit_utf8_byte_t escape_character = (lit_utf8_byte_t) *source_str_iter_p++;
      JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size);

      if (isdigit (escape_character))
      {
        if (escape_character == '0')
        {
          JERRY_UNIMPLEMENTED ("<NUL> character is not currently supported.\n");
        }
        else
        {
          /* Implementation-defined (ECMA-262 v5, B.1.2): octal escape sequences are not implemented */
          is_correct_sequence = false;
          break;
        }
      }
      else if (escape_character == 'u'
               || escape_character == 'x')
      {
        /* \u is followed by four hex digits, \x by two. */
        const uint32_t hex_chars_num = (escape_character == 'u' ? 4u : 2u);

        if (source_str_iter_p + hex_chars_num > source_str_p + source_str_size)
        {
          is_correct_sequence = false;
          break;
        }

        bool chars_are_hex = true;
        uint16_t char_code = 0;

        for (uint32_t i = 0; i < hex_chars_num; i++)
        {
          const lit_utf8_byte_t byte = (lit_utf8_byte_t) *source_str_iter_p++;

          if (!isxdigit (byte))
          {
            chars_are_hex = false;
            break;
          }
          else
          {
            /*
             * Check that highest 4 bits are zero, so the value would not overflow.
             */
            JERRY_ASSERT ((char_code & 0xF000u) == 0);

            char_code = (uint16_t) (char_code << 4u);
            char_code = (uint16_t) (char_code + ecma_char_hex_to_int (byte));
          }
        }

        JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
        JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size);

        if (!chars_are_hex)
        {
          is_correct_sequence = false;
          break;
        }

        /*
         * In CONFIG_ECMA_CHAR_ASCII mode size of ecma_char_t is 1 byte, so the conversion
         * would ignore highest part of 2-byte value, and in CONFIG_ECMA_CHAR_UTF16 mode this
         * would be just an assignment of 2-byte value.
         */
        converted_char = (ecma_char_t) char_code;
      }
      else if (ecma_char_is_line_terminator (escape_character))
      {
        /* Line continuation: produces no character; CR LF is consumed as one terminator. */
        if (source_str_iter_p + 1 <= source_str_p + source_str_size)
        {
          lit_utf8_byte_t byte = *source_str_iter_p;

          if (escape_character == '\x0D'
              && byte == '\x0A')
          {
            source_str_iter_p++;
          }
        }

        continue;
      }
      else
      {
        convert_single_escape_character ((ecma_char_t) escape_character, &converted_char);
      }
    }

    TODO ("Support surrogate paris.")
    str_buf_iter_p += lit_code_unit_to_utf8 (converted_char, str_buf_iter_p);
    JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);

    /*
     * The <ctype.h> classifiers are defined only for arguments representable as unsigned char
     * (or EOF), while a \uXXXX escape can produce a larger code unit. Such code units are
     * treated as neither lower case nor identifier characters without calling the classifiers.
     */
    const bool fits_in_byte = (converted_char == (lit_utf8_byte_t) converted_char);

    if (!fits_in_byte || !islower (converted_char))
    {
      every_char_islower = false;

      if (!fits_in_byte
          || (!isalpha (converted_char)
              && !isdigit (converted_char)
              && converted_char != '$'
              && converted_char != '_'))
      {
        every_char_allowed_in_identifier = false;
      }
    }
  }

  if (is_correct_sequence)
  {
    lit_utf8_size_t length = (lit_utf8_size_t) (str_buf_iter_p - str_buf_p);
    ret = empty_token;

    if (tok_type == TOK_NAME)
    {
      if (every_char_islower)
      {
        /* Only an all-lower-case name is looked up as a keyword. */
        ret = decode_keyword (str_buf_p, length);
      }
      else if (!every_char_allowed_in_identifier)
      {
        PARSE_ERROR ("Malformed identifier name", source_str_p - buffer_start);
      }
    }

    /* Not a keyword (or not a name at all): create the token from the converted string. */
    if (is_empty (ret))
    {
      ret = convert_string_to_token (tok_type, str_buf_p, length);
    }
  }
  else
  {
    PARSE_ERROR ("Malformed escape sequence", source_str_p - buffer_start);
  }

  jsp_mm_free (str_buf_p);

  return ret;
} /* convert_string_to_token_transform_escape_seq */
/**
 * Unit test for the CESU-8 helpers
 *
 * Random strings are generated and walked forwards and backwards with the
 * peek / incr / decr / read helpers, checking that every traversal sees the same
 * code units; then a few fixed byte sequences are checked against the validator
 * and the code unit encoder.
 */
int
main (int __attr_unused___ argc,
      char __attr_unused___ **argv)
{
  TEST_INIT ();

  jmem_init ();
  lit_init ();
  ecma_init ();

  lit_utf8_byte_t cesu8_string[max_bytes_in_string];
  ecma_char_t code_units[max_code_units_in_string];
  const lit_utf8_byte_t *saved_positions[max_code_units_in_string];

  for (int iteration = 0; iteration < test_iters; iteration++)
  {
    /* The very first iteration exercises the empty string. */
    lit_utf8_size_t byte_count = 0;

    if (iteration != 0)
    {
      byte_count = (lit_utf8_size_t) (rand () % max_bytes_in_string);
    }

    const ecma_length_t expected_length = generate_cesu8_string (cesu8_string, byte_count);

    /* Length reported by an ecma-string created from the bytes. */
    ecma_string_t *string_p = ecma_new_ecma_string_from_utf8 (cesu8_string, byte_count);
    ecma_length_t string_length = ecma_string_get_length (string_p);
    JERRY_ASSERT (string_length == expected_length);
    ecma_deref_ecma_string (string_p);

    /* Length reported by the raw buffer helper. */
    JERRY_ASSERT (lit_utf8_string_length (cesu8_string, byte_count) == expected_length);

    const lit_utf8_byte_t *const end_p = cesu8_string + byte_count;
    const lit_utf8_byte_t *cursor_p;

    ecma_length_t walked_length = 0;
    ecma_length_t recorded_count = 0;

    /* Forward walk with peek + incr, recording every code unit and its position. */
    for (cursor_p = cesu8_string; cursor_p < end_p; lit_utf8_incr (&cursor_p))
    {
      code_units[recorded_count] = lit_utf8_peek_next (cursor_p);
      saved_positions[recorded_count] = cursor_p;
      recorded_count++;
      walked_length++;
    }

    JERRY_ASSERT (expected_length == walked_length);

    /* Random access through the recorded positions. */
    for (int probe = 0; recorded_count > 0 && probe < test_subiters; probe++)
    {
      const ecma_length_t index = (ecma_length_t) rand () % recorded_count;

      cursor_p = saved_positions[index];
      JERRY_ASSERT (lit_utf8_peek_next (cursor_p) == code_units[index]);
    }

    /* Backward walk with peek_prev + decr. */
    cursor_p = end_p;

    while (cursor_p > cesu8_string)
    {
      JERRY_ASSERT (recorded_count > 0);
      walked_length--;
      JERRY_ASSERT (code_units[walked_length] == lit_utf8_peek_prev (cursor_p));
      lit_utf8_decr (&cursor_p);
    }

    JERRY_ASSERT (walked_length == 0);

    /* Forward walk with read_next. */
    while (cursor_p < end_p)
    {
      ecma_char_t code_unit = lit_utf8_read_next (&cursor_p);
      JERRY_ASSERT (code_unit == code_units[walked_length]);
      walked_length++;
    }

    JERRY_ASSERT (expected_length == walked_length);

    /* Backward walk with read_prev. */
    while (cursor_p > cesu8_string)
    {
      JERRY_ASSERT (recorded_count > 0);
      walked_length--;
      JERRY_ASSERT (code_units[walked_length] == lit_utf8_read_prev (&cursor_p));
    }

    JERRY_ASSERT (walked_length == 0);
  }

  /* Overlong-encoded code point */
  lit_utf8_byte_t overlong_two_bytes[] = {0xC0, 0x82};
  JERRY_ASSERT (!lit_is_cesu8_string_valid (overlong_two_bytes, sizeof (overlong_two_bytes)));

  /* Overlong-encoded code point */
  lit_utf8_byte_t overlong_three_bytes[] = {0xE0, 0x80, 0x81};
  JERRY_ASSERT (!lit_is_cesu8_string_valid (overlong_three_bytes, sizeof (overlong_three_bytes)));

  /* Pair of surrogates: 0xD901 0xDFF0 which encode Unicode character 0x507F0 (accepted by the validator) */
  lit_utf8_byte_t surrogate_pair[] = {0xED, 0xA4, 0x81, 0xED, 0xBF, 0xB0};
  JERRY_ASSERT (lit_is_cesu8_string_valid (surrogate_pair, sizeof (surrogate_pair)));

  /* Isolated high surrogate 0xD901 */
  lit_utf8_byte_t lone_high_surrogate[] = {0xED, 0xA4, 0x81};
  JERRY_ASSERT (lit_is_cesu8_string_valid (lone_high_surrogate, sizeof (lone_high_surrogate)));

  lit_utf8_byte_t encoded[3];
  lit_utf8_size_t encoded_size;

  /* One byte long encoding */
  encoded_size = lit_code_unit_to_utf8 (0x73, encoded);
  JERRY_ASSERT (encoded_size == 1);
  JERRY_ASSERT (encoded[0] == 0x73);

  /* Two bytes long encoding */
  encoded_size = lit_code_unit_to_utf8 (0x41A, encoded);
  JERRY_ASSERT (encoded_size == 2);
  JERRY_ASSERT (encoded[0] == 0xD0);
  JERRY_ASSERT (encoded[1] == 0x9A);

  /* Three bytes long encoding */
  encoded_size = lit_code_unit_to_utf8 (0xD7FF, encoded);
  JERRY_ASSERT (encoded_size == 3);
  JERRY_ASSERT (encoded[0] == 0xED);
  JERRY_ASSERT (encoded[1] == 0x9F);
  JERRY_ASSERT (encoded[2] == 0xBF);

  ecma_finalize ();
  lit_finalize ();
  jmem_finalize (true);

  return 0;
} /* main */
/**
 * Helper function to convert a string to upper or lower case.
 *
 * The conversion makes two passes over the input string: the first pass computes
 * the size of the converted string, the second pass writes the converted string
 * into a buffer of exactly that size.
 *
 * A high surrogate immediately followed by a low surrogate is re-encoded as a single
 * code point and is never case-converted. A high surrogate that is not followed by a
 * low surrogate (including one at the very end of the string) falls through to the
 * regular per-code-unit conversion.
 *
 * @return completion value
 *         Returned value must be freed with ecma_free_completion_value.
 */
static ecma_completion_value_t
ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /**< this argument */
                                                        bool lower_case) /**< convert to lower (true)
                                                                          *   or upper (false) case */
{
  ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();

  /* 1. */
  ECMA_TRY_CATCH (check_coercible_val,
                  ecma_op_check_object_coercible (this_arg),
                  ret_value);

  /* 2. */
  ECMA_TRY_CATCH (to_string_val,
                  ecma_op_to_string (this_arg),
                  ret_value);

  /* 3. */
  ecma_string_t *input_string_p = ecma_get_string_from_value (to_string_val);
  lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);

  MEM_DEFINE_LOCAL_ARRAY (input_start_p,
                          input_size,
                          lit_utf8_byte_t);

  ecma_string_to_utf8_string (input_string_p,
                              input_start_p,
                              (ssize_t) (input_size));

  /*
   * The case conversion has two major phases: first we compute
   * the size of the converted string, then we encode it.
   */

  lit_utf8_size_t output_length = 0;
  lit_utf8_iterator_t input_iterator = lit_utf8_iterator_create (input_start_p, input_size);

  while (!lit_utf8_iterator_is_eos (&input_iterator))
  {
    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
    ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
    /* Scratch output: only the encoded sizes matter in this phase. */
    lit_utf8_byte_t utf8_byte_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
    lit_utf8_size_t character_length;

    /*
     * We need to keep surrogate pairs: a valid pair is copied as one code point.
     * The iterator is peeked only when there is a next code unit, so a high
     * surrogate that ends the string is not followed by a read at end of string.
     */
    if (lit_is_code_unit_high_surrogate (character)
        && !lit_utf8_iterator_is_eos (&input_iterator))
    {
      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);

      if (lit_is_code_unit_low_surrogate (next_character))
      {
        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
        output_length += lit_code_point_to_utf8 (surrogate_code_point, utf8_byte_buffer);
        lit_utf8_iterator_incr (&input_iterator);
        continue;
      }
    }

    if (lower_case)
    {
      character_length = lit_char_to_lower_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }
    else
    {
      character_length = lit_char_to_upper_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }

    JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);

    for (lit_utf8_size_t i = 0; i < character_length; i++)
    {
      output_length += lit_code_unit_to_utf8 (character_buffer[i], utf8_byte_buffer);
    }
  }

  /* Second phase. */

  MEM_DEFINE_LOCAL_ARRAY (output_start_p,
                          output_length,
                          lit_utf8_byte_t);

  lit_utf8_byte_t *output_char_p = output_start_p;

  /* Encoding the output. */
  lit_utf8_iterator_seek_bos (&input_iterator);

  while (!lit_utf8_iterator_is_eos (&input_iterator))
  {
    ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
    ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
    lit_utf8_size_t character_length;

    /*
     * Same surrogate pair handling (and the same end of string guard)
     * as in the first phase, so both phases take identical decisions.
     */
    if (lit_is_code_unit_high_surrogate (character)
        && !lit_utf8_iterator_is_eos (&input_iterator))
    {
      ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);

      if (lit_is_code_unit_low_surrogate (next_character))
      {
        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
        output_char_p += lit_code_point_to_utf8 (surrogate_code_point, output_char_p);
        lit_utf8_iterator_incr (&input_iterator);
        continue;
      }
    }

    if (lower_case)
    {
      character_length = lit_char_to_lower_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }
    else
    {
      character_length = lit_char_to_upper_case (character,
                                                 character_buffer,
                                                 LIT_MAXIMUM_OTHER_CASE_LENGTH);
    }

    JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);

    for (lit_utf8_size_t i = 0; i < character_length; i++)
    {
      /* Same encoder as in the sizing phase, so the written size matches the computed one. */
      output_char_p += lit_code_unit_to_utf8 (character_buffer[i], output_char_p);
    }
  }

  JERRY_ASSERT (output_start_p + output_length == output_char_p);

  ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);

  ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));

  MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
  MEM_FINALIZE_LOCAL_ARRAY (input_start_p);

  ECMA_FINALIZE (to_string_val);
  ECMA_FINALIZE (check_coercible_val);

  return ret_value;
} /* ecma_builtin_string_prototype_object_conversion_helper */
/**
 * Transform escape sequences in the charset, writing the converted string to the specified buffer
 *
 * Handled escapes:
 *  - \0 not followed by an octal digit gives the NUL character; other octal escapes are rejected;
 *  - \8 and \9 give the digit itself;
 *  - \u and \x are decoded by lexer_convert_escape_sequence_digits_to_char;
 *  - a backslash followed by a line terminator gives nothing (CR LF counts as one terminator);
 *  - everything else is decoded by lexer_convert_single_escape_character.
 *
 * A high surrogate directly followed by a low surrogate in the converted output is merged
 * into a single code point.
 *
 * Note:
 *      Size of string with transformed escape sequences is always
 *      less or equal to size of corresponding source string.
 *
 * @return size of converted string
 */
static lit_utf8_size_t
lexer_transform_escape_sequences (const jerry_api_char_t *source_str_p, /**< string to convert,
                                                                         *   located in source buffer */
                                  lit_utf8_size_t source_str_size, /**< size of the string and of the output buffer */
                                  jerry_api_char_t *output_str_buf_p) /**< output buffer for converted string */
{
  if (source_str_size == 0)
  {
    return 0;
  }

  JERRY_ASSERT (source_str_p != NULL);

  const size_t output_capacity = source_str_size;
  lit_utf8_byte_t *write_p = output_str_buf_p;
  ecma_char_t previous_char = LIT_CHAR_NULL;
  bool sequences_are_valid = true;

  lit_utf8_iterator_t reader = lit_utf8_iterator_create (source_str_p, source_str_size);

  while (!lit_utf8_iterator_is_eos (&reader))
  {
    ecma_char_t decoded_char;
    const ecma_char_t current_char = lit_utf8_iterator_read_next (&reader);

    if (current_char != LIT_CHAR_BACKSLASH)
    {
      /* Ordinary character: copied as it is. */
      decoded_char = current_char;
    }
    else
    {
      /* A backslash must be followed by at least one character. */
      if (lit_utf8_iterator_is_eos (&reader))
      {
        sequences_are_valid = false;
        break;
      }

      const ecma_char_t escaped_char = lit_utf8_iterator_read_next (&reader);

      if (lit_char_is_decimal_digit (escaped_char))
      {
        if (!lit_char_is_octal_digit (escaped_char))
        {
          /* \8 and \9 */
          decoded_char = escaped_char;
        }
        else
        {
          const bool is_nul_escape = (escaped_char == LIT_CHAR_0
                                      && (lit_utf8_iterator_is_eos (&reader)
                                          || !lit_char_is_octal_digit (lit_utf8_iterator_peek_next (&reader))));

          if (!is_nul_escape)
          {
            /* Implementation-defined (ECMA-262 v5, B.1.2): octal escape sequences are not implemented */
            sequences_are_valid = false;
            break;
          }

          decoded_char = LIT_CHAR_NULL;
        }
      }
      else if (escaped_char == LIT_CHAR_LOWERCASE_U
               || escaped_char == LIT_CHAR_LOWERCASE_X)
      {
        const bool is_unicode_escape = (escaped_char == LIT_CHAR_LOWERCASE_U);

        if (!lexer_convert_escape_sequence_digits_to_char (&reader, is_unicode_escape, &decoded_char))
        {
          sequences_are_valid = false;
          break;
        }
      }
      else if (lit_char_is_line_terminator (escaped_char))
      {
        /* Skip \, followed by a LineTerminatorSequence (ECMA-262, v5, 7.3) */
        if (escaped_char == LIT_CHAR_CR
            && !lit_utf8_iterator_is_eos (&reader)
            && lit_utf8_iterator_peek_next (&reader) == LIT_CHAR_LF)
        {
          lit_utf8_iterator_incr (&reader);
        }

        continue;
      }
      else
      {
        lexer_convert_single_escape_character (escaped_char, &decoded_char);
      }
    }

    const bool completes_surrogate_pair = (lit_is_code_unit_high_surrogate (previous_char)
                                           && lit_is_code_unit_low_surrogate (decoded_char));

    if (!completes_surrogate_pair)
    {
      write_p += lit_code_unit_to_utf8 (decoded_char, write_p);
      JERRY_ASSERT (write_p <= output_str_buf_p + output_capacity);
    }
    else
    {
      /* Step back over the high surrogate written earlier and emit the whole code point instead. */
      write_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;

      lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (previous_char, decoded_char);
      write_p += lit_code_point_to_utf8 (code_point, write_p);
    }

    previous_char = decoded_char;
  }

  if (is_correct_sequence_placeholder_unused_guard (sequences_are_valid))
  {
    return (lit_utf8_size_t) (write_p - output_str_buf_p);
  }
  else
  {
    PARSE_ERROR (JSP_EARLY_ERROR_SYNTAX, "Illegal escape sequence", token_start_pos);
  }
} /* lexer_transform_escape_sequences */
/**
 * Create token of specified type from charset
 *
 * The literal storage is searched for the charset and a new literal is created
 * only when no matching one exists. If the iterator reports a position in the
 * middle of a non-BMP character, the charset is first re-encoded code unit by
 * code unit into a temporary buffer, which is released before returning.
 *
 * @return token descriptor
 */
static token
lexer_create_token_for_charset (jsp_token_type_t tt, /**< token type */
                                const lit_utf8_byte_t *charset_p, /**< charset buffer */
                                lit_utf8_size_t size) /**< size of the charset */
{
  JERRY_ASSERT (charset_p != NULL);

  lit_utf8_iterator_t iter = lit_utf8_iterator_create (charset_p, (lit_utf8_size_t) size);
  lit_utf8_size_t converted_capacity = 0;
  bool needs_conversion = false;

  /* One pass to detect non-BMP characters and to size the buffer for the re-encoded string. */
  for (; !lit_utf8_iterator_is_eos (&iter); lit_utf8_iterator_incr (&iter))
  {
    if (iter.buf_pos.is_non_bmp_middle)
    {
      needs_conversion = true;
    }

    converted_capacity += LIT_CESU8_MAX_BYTES_IN_CODE_UNIT;
  }

  lit_utf8_byte_t *literal_bytes_p;
  lit_utf8_size_t literal_size = 0;

  if (unlikely (needs_conversion))
  {
    /* Re-encode every code unit on its own into a temporary buffer. */
    lit_utf8_iterator_seek_bos (&iter);
    literal_bytes_p = (lit_utf8_byte_t *) jsp_mm_alloc (converted_capacity);

    while (!lit_utf8_iterator_is_eos (&iter))
    {
      ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
      literal_size += lit_code_unit_to_utf8 (code_unit, literal_bytes_p + literal_size);
    }
  }
  else
  {
    /* The source bytes can be used directly. */
    literal_bytes_p = (lit_utf8_byte_t *) charset_p;
    literal_size = size;
    JERRY_ASSERT (lit_is_cesu8_string_valid (literal_bytes_p, literal_size));
  }

  lit_literal_t lit = lit_find_literal_by_utf8_string (literal_bytes_p, literal_size);

  if (lit == NULL)
  {
    lit = lit_create_literal_from_utf8_string (literal_bytes_p, literal_size);

    rcs_record_type_t type = rcs_record_get_type (lit);

    JERRY_ASSERT (RCS_RECORD_TYPE_IS_CHARSET (type)
                  || RCS_RECORD_TYPE_IS_MAGIC_STR (type)
                  || RCS_RECORD_TYPE_IS_MAGIC_STR_EX (type));
  }

  /* The temporary buffer is owned by this function only when a conversion took place. */
  if (unlikely (needs_conversion))
  {
    jsp_mm_free (literal_bytes_p);
  }

  return create_token_from_lit (tt, lit);
} /* lexer_create_token_for_charset */
/**
 * Unit test for the UTF-8 iterator and helpers
 *
 * Random strings are generated and traversed forwards and backwards with the
 * iterator, checking that every traversal sees the same code units; then a few
 * fixed byte sequences are checked against the validator, the code point reader,
 * the code unit encoder and the iterator.
 */
int
main (int __attr_unused___ argc,
      char __attr_unused___ **argv)
{
  TEST_INIT ();

  mem_init ();

  lit_utf8_byte_t utf8_string[max_bytes_in_string];
  ecma_char_t code_units[max_code_units_in_string];
  lit_utf8_iterator_pos_t saved_positions[max_code_units_in_string];

  for (int iteration = 0; iteration < test_iters; iteration++)
  {
    /* The very first iteration exercises the empty string. */
    lit_utf8_size_t byte_count = 0;

    if (iteration != 0)
    {
      byte_count = (lit_utf8_size_t) (rand () % max_bytes_in_string);
    }

    const ecma_length_t expected_length = generate_utf8_string (utf8_string, byte_count);

    JERRY_ASSERT (lit_utf8_string_length (utf8_string, byte_count) == expected_length);

    lit_utf8_iterator_t iter = lit_utf8_iterator_create (utf8_string, byte_count);
    ecma_length_t walked_length = 0;
    ecma_length_t recorded_count = 0;

    /* Forward walk with peek + incr, recording every code unit and its position. */
    for (; !lit_utf8_iterator_is_eos (&iter); lit_utf8_iterator_incr (&iter))
    {
      code_units[recorded_count] = lit_utf8_iterator_peek_next (&iter);
      saved_positions[recorded_count] = lit_utf8_iterator_get_pos (&iter);
      recorded_count++;
      walked_length++;
    }

    JERRY_ASSERT (expected_length == walked_length);

    /* Random access through the recorded positions. */
    for (int probe = 0; recorded_count > 0 && probe < test_subiters; probe++)
    {
      const ecma_length_t index = (ecma_length_t) rand () % recorded_count;

      lit_utf8_iterator_seek (&iter, saved_positions[index]);
      JERRY_ASSERT (lit_utf8_iterator_peek_next (&iter) == code_units[index]);
      JERRY_ASSERT (lit_utf8_iterator_get_index (&iter) == index);
    }

    /* Backward walk with peek_prev + decr. */
    lit_utf8_iterator_seek_eos (&iter);

    while (!lit_utf8_iterator_is_bos (&iter))
    {
      JERRY_ASSERT (recorded_count > 0);
      walked_length--;
      JERRY_ASSERT (code_units[walked_length] == lit_utf8_iterator_peek_prev (&iter));
      lit_utf8_iterator_decr (&iter);
    }

    JERRY_ASSERT (walked_length == 0);

    /* Forward walk with read_next. */
    while (!lit_utf8_iterator_is_eos (&iter))
    {
      ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
      JERRY_ASSERT (code_unit == code_units[walked_length]);
      walked_length++;
    }

    JERRY_ASSERT (expected_length == walked_length);

    /* Backward walk with read_prev. */
    while (!lit_utf8_iterator_is_bos (&iter))
    {
      JERRY_ASSERT (recorded_count > 0);
      walked_length--;
      JERRY_ASSERT (code_units[walked_length] == lit_utf8_iterator_read_prev (&iter));
    }

    JERRY_ASSERT (walked_length == 0);
  }

  /* Overlong-encoded code point */
  lit_utf8_byte_t overlong_two_bytes[] = {0xC0, 0x82};
  JERRY_ASSERT (!lit_is_utf8_string_valid (overlong_two_bytes, sizeof (overlong_two_bytes)));

  /* Overlong-encoded code point */
  lit_utf8_byte_t overlong_three_bytes[] = {0xE0, 0x80, 0x81};
  JERRY_ASSERT (!lit_is_utf8_string_valid (overlong_three_bytes, sizeof (overlong_three_bytes)));

  /* Pair of surrogates: 0xD901 0xDFF0 which encode Unicode character 0x507F0 */
  lit_utf8_byte_t surrogate_pair[] = {0xED, 0xA4, 0x81, 0xED, 0xBF, 0xB0};
  JERRY_ASSERT (!lit_is_utf8_string_valid (surrogate_pair, sizeof (surrogate_pair)));

  /* Isolated high surrogate 0xD901 */
  lit_utf8_byte_t lone_high_surrogate[] = {0xED, 0xA4, 0x81};
  JERRY_ASSERT (lit_is_utf8_string_valid (lone_high_surrogate, sizeof (lone_high_surrogate)));

  /* 4-byte long utf-8 character - Unicode character 0x507F0 */
  lit_utf8_byte_t four_byte_character[] = {0xF1, 0x90, 0x9F, 0xB0};
  JERRY_ASSERT (lit_is_utf8_string_valid (four_byte_character, sizeof (four_byte_character)));

  /* Reading a 4-byte long character as a single code point */
  lit_utf8_byte_t code_point_bytes[] = {0xF0, 0x90, 0x8D, 0x88};
  lit_code_point_t code_point;
  lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (code_point_bytes,
                                                               sizeof (code_point_bytes),
                                                               &code_point);
  JERRY_ASSERT (bytes_count == 4);
  JERRY_ASSERT (code_point == 0x10348);

  lit_utf8_byte_t encoded[3];
  lit_utf8_size_t encoded_size;

  /* One byte long encoding */
  encoded_size = lit_code_unit_to_utf8 (0x73, encoded);
  JERRY_ASSERT (encoded_size == 1);
  JERRY_ASSERT (encoded[0] == 0x73);

  /* Two bytes long encoding */
  encoded_size = lit_code_unit_to_utf8 (0x41A, encoded);
  JERRY_ASSERT (encoded_size == 2);
  JERRY_ASSERT (encoded[0] == 0xD0);
  JERRY_ASSERT (encoded[1] == 0x9A);

  /* Three bytes long encoding */
  encoded_size = lit_code_unit_to_utf8 (0xD7FF, encoded);
  JERRY_ASSERT (encoded_size == 3);
  JERRY_ASSERT (encoded[0] == 0xED);
  JERRY_ASSERT (encoded[1] == 0x9F);
  JERRY_ASSERT (encoded[2] == 0xBF);

  /* The iterator reads a 4-byte long character as two code units: 0xD800 followed by 0xDF48 */
  lit_utf8_byte_t iterated_bytes[] = {0xF0, 0x90, 0x8D, 0x88};
  lit_utf8_iterator_t iter = lit_utf8_iterator_create (iterated_bytes, sizeof (iterated_bytes));

  ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
  JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter));
  JERRY_ASSERT (code_unit == 0xD800);

  code_unit = lit_utf8_iterator_read_next (&iter);
  JERRY_ASSERT (lit_utf8_iterator_is_eos (&iter));
  JERRY_ASSERT (code_unit == 0xDF48);

  mem_finalize (true);

  return 0;
}