/*
 * Minifier entry point
 *
 * Walks the source buffer once and writes the minified result into the
 * target buffer. The return value is the length of the result stored in
 * the target buffer.
 *
 * If the target buffer fills up before the source is exhausted, the
 * return value exceeds tlength by the number of source characters that
 * were still unconsumed at that moment. The caller is then expected to
 * grow the target buffer to the returned size and call rcssmin again,
 * repeating until the result fits.
 */
static Py_ssize_t
rcssmin(const rchar *source, rchar *target, Py_ssize_t slength,
        Py_ssize_t tlength, int keep_bang_comments)
{
    rcssmin_ctx_t state;
    rcssmin_ctx_t *ctx = &state;
    const rchar *out_begin = target;
    Py_ssize_t written, unread;
    rchar ch;

    /* Buffer boundaries */
    ctx->start = source;
    ctx->sentinel = source + slength;
    ctx->tsentinel = target + tlength;

    /* Parser flags */
    ctx->at_group = 0;
    ctx->in_macie5 = 0;
    ctx->in_rule = 0;
    ctx->keep_bang_comments = keep_bang_comments;

    for (;;) {
        /* Stop when the input is exhausted or the output is full */
        if (!(source < ctx->sentinel))
            break;
        if (!(target < ctx->tsentinel))
            break;

        ch = *source++;

        /* Characters without special meaning are copied verbatim */
        if (RCSSMIN_IS_DULL(ch)) {
            *target++ = ch;
            continue;
        }

        if (RCSSMIN_IS_SPACE(ch)) {
            copy_space(&source, &target, ctx, NEED_SPACE_MAYBE);
            continue;
        }

        /* Every case ends the iteration; nothing follows the switch */
        switch (ch) {

        /* Escape */
        case U('\\'):
            copy_escape(&source, &target, ctx);
            break;

        /* String */
        case U('"'):
        case U('\''):
            copy_string(&source, &target, ctx);
            break;

        /* URL */
        case U('u'):
            copy_url(&source, &target, ctx);
            break;

        /* IE7hack */
        case U('>'):
            copy_ie7hack(&source, &target, ctx);
            break;

        /* @-group */
        case U('@'):
            copy_at_group(&source, &target, ctx);
            break;

        /* ; */
        case U(';'):
            copy_semicolon(&source, &target, ctx);
            break;

        /* :first-line|letter followed by [{,] */
        /* (apparently needed for IE6) */
        case U(':'):
            copy_first(&source, &target, ctx);
            break;

        /* {: consumes a pending @-group level, otherwise opens a rule */
        case U('{'):
            if (ctx->at_group)
                --ctx->at_group;
            else
                ++ctx->in_rule;
            *target++ = ch;
            break;

        /* }: leaves a rule level if one is open */
        case U('}'):
            if (ctx->in_rule)
                --ctx->in_rule;
            *target++ = ch;
            break;

        /* space starting with comment */
        case U('/'):
            (void)copy_space_comment(&source, &target, ctx,
                                     NEED_SPACE_MAYBE);
            break;

        /* Fallback: copy character. Better safe than sorry. Should not be
         * reached, though */
        default:
            *target++ = ch;
            break;
        }
    }

    written = (Py_ssize_t)(target - out_begin);
    unread = (Py_ssize_t)(ctx->sentinel - source);

    return written + unread;
}
bool parse_url(char const* input, url_t* url, url_t const* base, bool* errors) { enum state_t { state_scheme_start, state_scheme, state_no_scheme, state_relative_or_authority, state_path_or_authority, state_relative, state_relative_slash, state_authority_slashes, state_authority_ignore_slashes, state_authority, state_host, state_port, state_file, state_file_slash, state_file_host, state_path_start, state_path, state_non_relative_path, state_query, state_fragment, }; size_t length = strlen(input); if (!length) return false; for (size_t i = 0; i < length; ++i) { if (input[i] < 0x00 || input[i] > 0x7E) { // no unicode support return false; } if (input[i] == 0x0D || input[i] == 0x0A || input[i] == 0x09) { // no tab or newline return false; } } bool s_errors = false; if (errors) { *errors = false; } else { errors = &s_errors; } if (input[0] <= 0x20 || input[length - 1] <= 0x20) { // leading or trailing C0-controls and space *errors = true; while (*input <= 0x20) { ++input; --length; } while (length && input[length - 1] <= 0x20) { --length; } } state_t state = state_scheme_start; reset_url(url); bool at_flag = false; bool bracket_flag = false; std::string buffer; char const* ptr = input; while (true) { switch (state) { case state_scheme_start: if (std::isalpha(*ptr)) { buffer.push_back(std::tolower(*ptr)); state = state_scheme; } else { state = state_no_scheme; --ptr; } break; case state_scheme: if (std::isalnum(*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.') { buffer.push_back(std::tolower(*ptr)); } else if (*ptr == ':') { url->scheme = buffer; url->is_special = (url->scheme == "ftp" || url->scheme == "gopher" || url->scheme == "http" || url->scheme == "https" || url->scheme == "ws" || url->scheme == "wss" || url->scheme == "file"); buffer.clear(); if (url->scheme == "file") { if (ptr[1] != '/' || ptr[2] != '/') { *errors = true; } state = state_file; } else if (url->is_special) { if (base && base->scheme == url->scheme) { state = state_relative_or_authority; } else 
{ state = state_authority_slashes; } } else if (ptr[1] == '/') { state = state_path_or_authority; ++ptr; } else { url->non_relative = true; url->path.push_back(""); state = state_non_relative_path; } } else { buffer.clear(); ptr = input - 1; state = state_no_scheme; } break; case state_no_scheme: if (!base || (base->non_relative && *ptr != '#')) { return false; } else if (base->non_relative && *ptr == '#') { copy_url(url, base, c_scheme | c_path | c_query); url->non_relative = true; state = state_fragment; } else if (base->scheme != "file") { state = state_relative; --ptr; } else { state = state_file; --ptr; } break; case state_relative_or_authority: if (ptr[0] == '/' && ptr[1] == '/') { state = state_authority_ignore_slashes; ++ptr; } else { *errors = true; state = state_relative; --ptr; } break; case state_path_or_authority: if (*ptr == '/') { state = state_authority; } else { state = state_path; --ptr; } break; case state_relative: copy_url(url, base, c_scheme); switch (*ptr) { case 0: copy_url(url, base, c_username | c_password | c_host | c_port | c_path | c_query); break; case '/': state = state_relative_slash; break; case '?': copy_url(url, base, c_username | c_password | c_host | c_port | c_path); state = state_query; break; case '#': copy_url(url, base, c_username | c_password | c_host | c_port | c_path | c_query); state = state_fragment; break; default: if (url->is_special && *ptr == '\\') { *errors = true; state = state_relative_slash; } else { copy_url(url, base, c_username | c_password | c_host | c_port | c_path); if (url->path.size()) { url->path.pop_back(); } state = state_path; --ptr; } } break; case state_relative_slash: if (*ptr == '/' || (url->is_special && *ptr == '\\')) { if (*ptr == '\\') { *errors = true; } state = state_authority_ignore_slashes; } else { copy_url(url, base, c_username | c_password | c_host | c_port); state = state_path; --ptr; } break; case state_authority_slashes: if (ptr[0] == '/' && ptr[1] == '/') { state = 
state_authority_ignore_slashes; ++ptr; } else { *errors = true; state = state_authority_ignore_slashes; --ptr; } break; case state_authority_ignore_slashes: if (*ptr != '/' && *ptr != '\\') { state = state_authority; --ptr; } else { *errors = true; } break; case state_authority: if (*ptr == '@') { *errors = true; if (at_flag) { buffer = "%40" + buffer; } at_flag = true; bool password = false; for (char chr : buffer) { if (chr == ':') { password = true; } else if (password) { percent_encode<encode_userinfo>(url->password, chr); } else { percent_encode<encode_userinfo>(url->username, chr); } } buffer.clear(); } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) { ptr -= buffer.size() + 1; buffer.clear(); state = state_host; } else { buffer.push_back(*ptr); } break; case state_host: if (*ptr == ':' && !bracket_flag) { if (url->is_special && buffer.empty()) { return false; } if (!parse_host(buffer, url, errors)) { return false; } buffer.clear(); state = state_port; } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) { --ptr; if (url->is_special && buffer.empty()) { return false; } if (!parse_host(buffer, url, errors)) { return false; } buffer.clear(); state = state_path_start; } else { if (*ptr == '[') { bracket_flag = true; } else if (*ptr == ']') { bracket_flag = false; } buffer.push_back(*ptr); } break; case state_port: if (std::isdigit(*ptr)) { buffer.push_back(*ptr); } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) { if (buffer.size() > 5) return false; int length, port; if (sscanf(buffer.c_str(), "%d%n", &port, &length) != 1 || static_cast<size_t>(length) != buffer.size() || port > 65535) { return false; } url->port = (port == scheme_port(url->scheme.c_str()) ? 
0 : port); buffer.clear(); state = state_path_start; --ptr; } else { return false; } break; case state_file: url->scheme = "file"; url->is_special = true; switch (*ptr) { case 0: if (base && base->scheme == "file") { copy_url(url, base, c_host | c_path | c_query); } break; case '\\': *errors = true; // fall through case '/': state = state_file_slash; break; case '?': if (base && base->scheme == "file") { copy_url(url, base, c_host | c_path); } state = state_query; break; case '#': if (base && base->scheme == "file") { copy_url(url, base, c_host | c_path | c_query); } state = state_fragment; break; default: if (base && base->scheme == "file" && (!std::isalpha(ptr[0]) || (ptr[1] != ':' && ptr[1] != '|') || ptr[2] == 0 || (ptr[2] != '/' && ptr[2] != '\\' && ptr[2] != '?' && ptr[2] != '#'))) { copy_url(url, base, c_host | c_path); pop_path(url->path); } else if (base && base->scheme == "file") { return false; } state = state_path; --ptr; } break; case state_file_slash: if (*ptr == '/' || *ptr == '\\') { if (*ptr == '\\') *errors = true; state = state_file_host; } else { if (base && base->scheme == "file" && base->path.size() && is_normalized_drive(base->path[0])) { url->path.push_back(base->path[0]); } state = state_path; --ptr; } break; case state_file_host: if (*ptr == 0 || *ptr == '/' || *ptr == '\\' || *ptr == '?' || *ptr == '#') { --ptr; if (is_drive(buffer)) { *errors = true; state = state_path; } else if (buffer.empty()) { state = state_path_start; } else { if (!parse_host(buffer, url, errors)) { return false; } if (url->host == "localhost") { url->host.clear(); } buffer.clear(); state = state_path_start; } } else { buffer.push_back(*ptr); } break; case state_path_start: if (url->is_special && *ptr == '\\') { *errors = true; } state = state_path; if (*ptr != '/' && (!url->is_special || *ptr != '\\')) { --ptr; } break; case state_path: if (*ptr == 0 || *ptr == '/' || (url->is_special && *ptr == '\\') || *ptr == '?' 
|| *ptr == '#') { if (url->is_special && *ptr == '\\') { *errors = true; } if (buffer == "..") { pop_path(url->path); if (*ptr != '/' && (!url->is_special || *ptr != '\\')) { url->path.push_back(""); } } else if (buffer == "." && *ptr != '/' && (!url->is_special || *ptr != '\\')) { url->path.push_back(""); } else if (buffer != ".") { if (url->scheme == "file" && url->path.empty() && is_drive(buffer)) { if (url->host.size()) *errors = true; url->host.clear(); buffer[1] = ':'; } url->path.push_back(buffer); } buffer.clear(); if (*ptr == '?') { state = state_query; } else if (*ptr == '#') { state = state_fragment; } } else { if (!url_code_point(*ptr) && *ptr != '%') { *errors = true; } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) { *errors = true; } if (*ptr == '%' && ptr[1] == '2' && (ptr[2] == 'e' || ptr[2] == 'E')) { buffer.push_back('.'); ptr += 2; } else { percent_encode<encode_default>(buffer, *ptr); } } break; case state_non_relative_path: if (*ptr == '?') { state = state_query; } else if (*ptr == '#') { state = state_fragment; } else { if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') { *errors = true; } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) { *errors = true; } if (*ptr) { percent_encode<encode_simple>(url->path[0], *ptr); } } break; case state_query: if (*ptr == '#') { state = state_fragment; } else { if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') { *errors = true; } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) { *errors = true; } if (*ptr) { percent_encode<encode_query>(url->query, *ptr); } } break; case state_fragment: if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') { *errors = true; } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) { *errors = true; } if (*ptr) { url->query.push_back(*ptr); } break; } if (ptr < input + length) { ++ptr; } else { break; } } return true; }