/*
 * Decodes a URL/form-encoded string.
 *
 * Every "%XX" sequence whose two following characters are hexadecimal digits
 * is replaced by the single byte 0xXX, and every '+' is replaced by a space.
 * A '%' that is not followed by two hex digits (including a '%' too close to
 * the end of the string) is copied through unchanged.
 *
 * str      - the string to decode; it is only read.
 * encoding - the encoding the caller would like the result to carry
 *            (converted with rb_to_encoding_index()).
 *
 * Returns a new string.  When nothing needed decoding the result is
 * rb_str_dup(str).  The result is tagged with `encoding` only if its bytes
 * form a clean code range in that encoding; otherwise it keeps the encoding
 * of `str`.
 */
static VALUE
optimized_unescape(VALUE str, VALUE encoding)
{
    long i, len, beg = 0;
    VALUE dest = 0;                 /* created lazily on the first decoded byte */
    const char *cstr;
    int cr, origenc, encidx = rb_to_encoding_index(encoding);

    len  = RSTRING_LEN(str);
    cstr = RSTRING_PTR(str);

    for (i = 0; i < len; ++i) {
        char buf[1];
        const char c = cstr[i];
        int clen = 0;               /* extra source bytes consumed besides cstr[i] */

        if (c == '%') {
            /*
             * Not enough room left for "%XX".  Keep scanning instead of
             * stopping: a '+' may still follow (e.g. "a%+") and must be
             * decoded.
             */
            if (i + 3 > len) continue;
            if (!ISXDIGIT(cstr[i+1])) continue;
            if (!ISXDIGIT(cstr[i+2])) continue;
            buf[0] = ((char_to_number(cstr[i+1]) << 4)
                      | char_to_number(cstr[i+2]));
            clen = 2;
        }
        else if (c == '+') {
            buf[0] = ' ';
        }
        else {
            continue;
        }

        if (!dest) {
            dest = rb_str_buf_new(len);
        }

        /* Flush the untouched run [beg, i), then append the decoded byte. */
        rb_str_cat(dest, cstr + beg, i - beg);
        i += clen;
        beg = i + 1;

        rb_str_cat(dest, buf, 1);
    }

    if (dest) {
        /* Copy whatever follows the last decoded sequence. */
        rb_str_cat(dest, cstr + beg, len - beg);
        preserve_original_state(str, dest);
        cr = ENC_CODERANGE_UNKNOWN;
    }
    else {
        /* Nothing was decoded: hand back a duplicate and remember its code range. */
        dest = rb_str_dup(str);
        cr = ENC_CODERANGE(str);
    }

    origenc = rb_enc_get_index(str);
    if (origenc != encidx) {
        /* Try the requested encoding; fall back if the bytes are not clean in it. */
        rb_enc_associate_index(dest, encidx);
        if (!ENC_CODERANGE_CLEAN_P(rb_enc_str_coderange(dest))) {
            rb_enc_associate_index(dest, origenc);
            if (cr != ENC_CODERANGE_UNKNOWN)
                ENC_CODERANGE_SET(dest, cr);
        }
    }
    return dest;
}
/// Reads the next token from the input.
///
/// Comments (single-line and multi-line) are consumed without producing a
/// token; lexing then resumes with whatever follows them.
///
/// Lexing problems are pushed to `error_sink` and a token is still returned:
///  - "unterminated comment"               multi-line comment reaches end of input
///  - "expected '(' in raw string literal" raw string delimiter is not followed by '('
///  - "unterminated string"                string or raw string is never closed
///  - "invalid escape character"           unknown backslash escape in a string
///  - "integer literal is too large"       integer literal does not fit in `int`
///                                         (the token value is then clamped)
///
/// @return the next token, or an empty `token_sp` once the end of input is reached.
token_sp lexer::read_next_token()
{
    for (;;) {
        skip_whitespace();
        if (eof_char())
            return token_sp();

        char const* lex_start = pos;

        // Whitespace still present after skip_whitespace() is emitted as one token.
        if (is_ascii_whitespace(peek_char())) {
            advance_char();
            while (!eof_char() && is_ascii_whitespace(peek_char()))
                advance_char();
            return make_unique<simple_token>(text_range(lex_start, pos), token_type::whitespace);
        }

        if (is_single_line_comment_start()) {
            advance_char(2);
            while (!eof_char() && !is_single_line_comment_end())
                advance_char();
            // Consume the terminator only when there is one; never advance at end of input.
            if (!eof_char())
                advance_char();
            continue;
        }

        if (is_multi_line_comment_start()) {
            advance_char(2);
            for (;;) {
                if (eof_char()) {
                    error_sink->push(error_tag(text_range::make_empty(pos), "unterminated comment"));
                    break;
                }
                if (is_multi_line_comment_end()) {
                    advance_char(2);
                    break;
                }
                advance_char();
            }
            continue;
        }

        if (is_raw_string_literal_start()) {
            advance_char(2);
            std::string prefix;
            std::string value;

            // Collect the delimiter prefix up to the opening '('.
            for (;;) {
                if (eof_char() || peek_char() == ' ' || peek_char() == ')' || peek_char() == '\t' ||
                    peek_char() == '\v' || peek_char() == '\r' || peek_char() == '\n') {
                    text_range r(lex_start, pos);
                    error_sink->push(error_tag(r, "expected '(' in raw string literal"));
                    return make_unique<string_literal_token>(r, std::move(value));
                }
                if (peek_char() == '(') {
                    advance_char();
                    break;
                }
                prefix += peek_char();
                advance_char();
            }

            // Collect the body until the matching terminator.
            for (;;) {
                if (eof_char()) {
                    text_range r(lex_start, pos);
                    error_sink->push(error_tag(r, "unterminated string"));
                    return make_unique<string_literal_token>(r, std::move(value));
                }
                if (is_raw_string_literal_end(prefix)) {
                    // The terminator begins with ')' and the comparison (std::equal)
                    // short-circuits, so raw string literals should lex in linear time.
                    advance_char(2 + prefix.size());
                    return make_unique<string_literal_token>(text_range(lex_start, pos), std::move(value));
                }
                value += peek_char();
                advance_char();
            }
        }

        if (is_identifier_start(peek_char())) {
            std::string s(1, peek_char());
            advance_char();
            while (!eof_char() && is_identifier_trail(peek_char())) {
                s += peek_char();
                advance_char();
            }
            return make_identifier_token(text_range(lex_start, pos), std::move(s));
        }

        if (is_number(peek_char())) {
            // Largest value an `int` can hold (all unsigned value bits minus the sign bit),
            // computed without relying on any header.
            const unsigned int max_value = static_cast<unsigned int>(-1) >> 1;
            unsigned int accumulated = 0;
            bool overflow = false;
            do {
                const unsigned int digit = static_cast<unsigned int>(char_to_number(peek_char()));
                if (!overflow) {
                    // accumulated * 10 + digit <= max_value  <=>  accumulated <= (max_value - digit) / 10
                    if (accumulated > (max_value - digit) / 10) {
                        overflow = true;
                        accumulated = max_value;  // clamp instead of overflowing a signed int (UB)
                    } else {
                        accumulated = accumulated * 10 + digit;
                    }
                }
                advance_char();  // keep consuming digits so the token spans the whole literal
            } while (!eof_char() && is_number(peek_char()));

            if (overflow) {
                text_range r(lex_start, pos);
                error_sink->push(error_tag(r, "integer literal is too large"));
            }
            int value = static_cast<int>(accumulated);
            return make_unique<integer_literal_token>(text_range(lex_start, pos), value);
        }

        if (peek_char() == '\"') {
            advance_char();
            std::string value;
            for (;;) {
                if (eof_char() || peek_char() == '\n') {
                    // Ordinary strings may not span lines; the newline is left unconsumed.
                    text_range r(lex_start, pos);
                    error_sink->push(error_tag(r, "unterminated string"));
                    return make_unique<string_literal_token>(r, std::move(value));
                }
                if (peek_char() == '\"') {
                    advance_char();
                    return make_unique<string_literal_token>(text_range(lex_start, pos), std::move(value));
                }
                if (peek_char() == '\\') {
                    const char* escape_start = pos;
                    advance_char();
                    // A backslash at end of input is dropped; the next iteration
                    // reports the unterminated string.
                    if (!eof_char()) {
                        switch (peek_char()) {
                        case 'a': value += '\a'; break;
                        case 'b': value += '\b'; break;
                        case 'f': value += '\f'; break;
                        case 'n': value += '\n'; break;
                        case 'r': value += '\r'; break;
                        case 't': value += '\t'; break;
                        case 'v': value += '\v'; break;
                        case '\\': value += '\\'; break;
                        case '\'': value += '\''; break;
                        case '\"': value += '\"'; break;
                        default:
                            // Unknown escape: report it and keep both characters verbatim.
                            error_sink->push(error_tag(text_range(escape_start, pos + 1), "invalid escape character"));
                            value += '\\';
                            value += peek_char();
                            break;
                        }
                        advance_char();
                    }
                    continue;
                }
                value += peek_char();
                advance_char();
            }
        }

        // Single-character punctuation; anything unrecognised becomes an `unknown` token.
        token_type type = token_type::unknown;
        switch (peek_char()) {
        case '{': type = token_type::lbrace; break;
        case '}': type = token_type::rbrace; break;
        case '=': type = token_type::equals; break;
        case ';': type = token_type::semicolon; break;
        case ',': type = token_type::comma; break;
        case ':': type = token_type::colon; break;
        default: break;
        }
        advance_char();
        return make_unique<simple_token>(text_range(lex_start, pos), type);
    }
}