Example #1
0
/*
 * Minifier driver
 *
 * Repeatedly reads the next source character and either copies it straight
 * into the target buffer or hands control to the copy routine responsible
 * for that kind of construct.
 *
 * The returned value is the length of the result stored in the target
 * buffer. If the target buffer fills up before the source is exhausted, the
 * returned value exceeds tlength by the number of source characters that
 * were still unconsumed at that moment. The caller is then expected to grow
 * the target buffer to the returned size and invoke rcssmin again, repeating
 * until the result fits.
 */
static Py_ssize_t
rcssmin(const rchar *source, rchar *target, Py_ssize_t slength,
        Py_ssize_t tlength, int keep_bang_comments)
{
    rcssmin_ctx_t state;
    rcssmin_ctx_t *ctx = &state;
    const rchar *const out_begin = target;
    Py_ssize_t written, pending;
    rchar ch;

    /* Buffer boundaries */
    ctx->start = source;
    ctx->sentinel = source + slength;
    ctx->tsentinel = target + tlength;

    /* Parser flags / nesting counters */
    ctx->keep_bang_comments = keep_bang_comments;
    ctx->in_rule = 0;
    ctx->in_macie5 = 0;
    ctx->at_group = 0;

    while (source < ctx->sentinel && target < ctx->tsentinel) {
        ch = *source++;

        /* Characters without special meaning are copied verbatim */
        if (RCSSMIN_IS_DULL(ch)) {
            *target++ = ch;
            continue;
        }

        /* Whitespace run */
        if (RCSSMIN_IS_SPACE(ch)) {
            copy_space(&source, &target, ctx, NEED_SPACE_MAYBE);
            continue;
        }

        switch (ch) {

        case U('\\'):   /* escape sequence */
            copy_escape(&source, &target, ctx);
            break;

        case U('"'):    /* string literal */
        case U('\''):
            copy_string(&source, &target, ctx);
            break;

        case U('u'):    /* possibly url(...) */
            copy_url(&source, &target, ctx);
            break;

        case U('>'):    /* IE7 hack */
            copy_ie7hack(&source, &target, ctx);
            break;

        case U('@'):    /* @-group */
            copy_at_group(&source, &target, ctx);
            break;

        case U(';'):
            copy_semicolon(&source, &target, ctx);
            break;

        case U(':'):    /* :first-line|letter followed by [{,] */
                        /* (apparently needed for IE6) */
            copy_first(&source, &target, ctx);
            break;

        case U('{'):
            /* An opening brace either consumes a pending @-group or starts
             * a rule */
            if (ctx->at_group) {
                --ctx->at_group;
            }
            else {
                ++ctx->in_rule;
            }
            *target++ = ch;
            break;

        case U('}'):
            if (ctx->in_rule) {
                --ctx->in_rule;
            }
            *target++ = ch;
            break;

        case U('/'):    /* whitespace that begins with a comment */
            (void)copy_space_comment(&source, &target, ctx,
                                     NEED_SPACE_MAYBE);
            break;

        default:
            /* Not expected to be reached; copy the character anyway rather
             * than dropping it. */
            *target++ = ch;
            break;
        }
    }

    written = (Py_ssize_t)(target - out_begin);
    pending = (Py_ssize_t)(ctx->sentinel - source);

    return written + pending;
}
Example #2
0
/// Parses `input` into `*url` using a character-driven state machine whose
/// state names follow the WHATWG URL Standard's "basic URL parser".
///
/// @param input   NUL-terminated string to parse. Must be non-empty and
///                contain only bytes in 0x00..0x7E with no tab, CR or LF;
///                otherwise the function returns false before touching `url`.
///                Leading/trailing bytes <= 0x20 are stripped (and flagged as
///                an error); input that is empty after stripping is rejected
///                the same way as empty input.
/// @param url     Output. Reset with reset_url() once validation has passed,
///                then filled in as parsing proceeds. May be left partially
///                populated when the function returns false.
/// @param base    Optional base URL used to resolve relative references; may
///                be null, in which case input without a scheme fails.
/// @param errors  Optional. When non-null it is set to false after input
///                validation and to true whenever a recoverable syntax
///                violation is encountered.
/// @return true when a URL was produced, false on a fatal parse failure.
bool parse_url(char const* input, url_t* url, url_t const* base, bool* errors) {
  enum state_t {
    state_scheme_start,
    state_scheme,
    state_no_scheme,
    state_relative_or_authority,
    state_path_or_authority,
    state_relative,
    state_relative_slash,
    state_authority_slashes,
    state_authority_ignore_slashes,
    state_authority,
    state_host,
    state_port,
    state_file,
    state_file_slash,
    state_file_host,
    state_path_start,
    state_path,
    state_non_relative_path,
    state_query,
    state_fragment,
  };

  size_t length = strlen(input);
  if (!length) return false;
  for (size_t i = 0; i < length; ++i) {
    if (input[i] < 0x00 || input[i] > 0x7E) {
      // no unicode support
      return false;
    }
    if (input[i] == 0x0D || input[i] == 0x0A || input[i] == 0x09) {
      // no tab or newline
      return false;
    }
  }
  // From here on every byte is known to be 7-bit ASCII, so passing plain
  // `char` values to the <cctype> classifiers below is safe.

  bool s_errors = false;
  if (errors) {
    *errors = false;
  } else {
    errors = &s_errors;
  }

  // Owns the trimmed copy of the input (if trimming is needed). The state
  // machine detects end-of-input through `*ptr == 0` and peeks at ptr[1] and
  // ptr[2], so the text it walks must be NUL-terminated exactly where the
  // logical input ends; shrinking `length` alone is not enough.
  std::string trimmed;
  if (input[0] <= 0x20 || input[length - 1] <= 0x20) {
    // leading or trailing C0-controls and space
    *errors = true;
    // `length` bounds the scan: the terminating NUL is itself <= 0x20, so an
    // unbounded loop would run off the end of an all-whitespace string.
    while (length && *input <= 0x20) {
      ++input;
      --length;
    }
    while (length && input[length - 1] <= 0x20) {
      --length;
    }
    if (!length) {
      // nothing left after trimming; treat like empty input
      return false;
    }
    trimmed.assign(input, length);
    input = trimmed.c_str();
  }

  state_t state = state_scheme_start;
  reset_url(url);

  bool at_flag = false;        // an '@' has already been seen in the authority
  bool bracket_flag = false;   // currently inside '[' ... ']' of a host
  bool password_flag = false;  // the ':' separating user and password was seen

  std::string buffer;
  char const* ptr = input;
  // Each iteration handles the character at `ptr`; the position one past the
  // last character (the NUL) is processed too, as the end-of-input marker.
  // States that want the same character re-examined decrement `ptr` so the
  // increment at the bottom of the loop restores it.
  while (true) {
    switch (state) {
    case state_scheme_start:
      if (std::isalpha(*ptr)) {
        buffer.push_back(static_cast<char>(std::tolower(*ptr)));
        state = state_scheme;
      } else {
        state = state_no_scheme;
        --ptr;
      }
      break;
    case state_scheme:
      if (std::isalnum(*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.') {
        buffer.push_back(static_cast<char>(std::tolower(*ptr)));
      } else if (*ptr == ':') {
        url->scheme = buffer;
        url->is_special = (url->scheme == "ftp" || url->scheme == "gopher" || url->scheme == "http" ||
          url->scheme == "https" || url->scheme == "ws" || url->scheme == "wss" || url->scheme == "file");
        buffer.clear();
        if (url->scheme == "file") {
          if (ptr[1] != '/' || ptr[2] != '/') {
            *errors = true;
          }
          state = state_file;
        } else if (url->is_special) {
          if (base && base->scheme == url->scheme) {
            state = state_relative_or_authority;
          } else {
            state = state_authority_slashes;
          }
        } else if (ptr[1] == '/') {
          state = state_path_or_authority;
          ++ptr;
        } else {
          url->non_relative = true;
          url->path.push_back("");
          state = state_non_relative_path;
        }
      } else {
        // Not a scheme after all: restart from the first character.
        buffer.clear();
        ptr = input - 1;
        state = state_no_scheme;
      }
      break;
    case state_no_scheme:
      if (!base || (base->non_relative && *ptr != '#')) {
        return false;
      } else if (base->non_relative && *ptr == '#') {
        copy_url(url, base, c_scheme | c_path | c_query);
        url->non_relative = true;
        state = state_fragment;
      } else if (base->scheme != "file") {
        state = state_relative;
        --ptr;
      } else {
        state = state_file;
        --ptr;
      }
      break;
    case state_relative_or_authority:
      if (ptr[0] == '/' && ptr[1] == '/') {
        state = state_authority_ignore_slashes;
        ++ptr;
      } else {
        *errors = true;
        state = state_relative;
        --ptr;
      }
      break;
    case state_path_or_authority:
      if (*ptr == '/') {
        state = state_authority;
      } else {
        state = state_path;
        --ptr;
      }
      break;
    case state_relative:
      copy_url(url, base, c_scheme);
      switch (*ptr) {
      case 0:
        copy_url(url, base, c_username | c_password | c_host | c_port | c_path | c_query);
        break;
      case '/':
        state = state_relative_slash;
        break;
      case '?':
        copy_url(url, base, c_username | c_password | c_host | c_port | c_path);
        state = state_query;
        break;
      case '#':
        copy_url(url, base, c_username | c_password | c_host | c_port | c_path | c_query);
        state = state_fragment;
        break;
      default:
        if (url->is_special && *ptr == '\\') {
          *errors = true;
          state = state_relative_slash;
        } else {
          copy_url(url, base, c_username | c_password | c_host | c_port | c_path);
          // Drop the last segment of the base path; the input replaces it.
          if (url->path.size()) {
            url->path.pop_back();
          }
          state = state_path;
          --ptr;
        }
      }
      break;
    case state_relative_slash:
      if (*ptr == '/' || (url->is_special && *ptr == '\\')) {
        if (*ptr == '\\') {
          *errors = true;
        }
        state = state_authority_ignore_slashes;
      } else {
        copy_url(url, base, c_username | c_password | c_host | c_port);
        state = state_path;
        --ptr;
      }
      break;
    case state_authority_slashes:
      if (ptr[0] == '/' && ptr[1] == '/') {
        state = state_authority_ignore_slashes;
        ++ptr;
      } else {
        *errors = true;
        state = state_authority_ignore_slashes;
        --ptr;
      }
      break;
    case state_authority_ignore_slashes:
      if (*ptr != '/' && *ptr != '\\') {
        state = state_authority;
        --ptr;
      } else {
        *errors = true;
      }
      break;
    case state_authority:
      if (*ptr == '@') {
        *errors = true;
        if (at_flag) {
          // A previous '@' was part of the credentials, not the separator.
          buffer = "%40" + buffer;
        }
        at_flag = true;
        for (char chr : buffer) {
          if (chr == ':' && !password_flag) {
            // Only the first ':' separates user name from password; later
            // ones belong to the password. The flag lives outside this
            // branch so it survives across multiple '@' characters.
            password_flag = true;
          } else if (password_flag) {
            percent_encode<encode_userinfo>(url->password, chr);
          } else {
            percent_encode<encode_userinfo>(url->username, chr);
          }
        }
        buffer.clear();
      } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) {
        // End of authority: rewind to just after the credentials (or to the
        // start of the authority) and re-read those characters as the host.
        ptr -= buffer.size() + 1;
        buffer.clear();
        state = state_host;
      } else {
        buffer.push_back(*ptr);
      }
      break;
    case state_host:
      if (*ptr == ':' && !bracket_flag) {
        if (url->is_special && buffer.empty()) {
          return false;
        }
        if (!parse_host(buffer, url, errors)) {
          return false;
        }
        buffer.clear();
        state = state_port;
      } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) {
        --ptr;
        if (url->is_special && buffer.empty()) {
          return false;
        }
        if (!parse_host(buffer, url, errors)) {
          return false;
        }
        buffer.clear();
        state = state_path_start;
      } else {
        // ':' inside brackets is part of the host, so track bracket nesting.
        if (*ptr == '[') {
          bracket_flag = true;
        } else if (*ptr == ']') {
          bracket_flag = false;
        }
        buffer.push_back(*ptr);
      }
      break;
    case state_port:
      if (std::isdigit(*ptr)) {
        buffer.push_back(*ptr);
      } else if (*ptr == 0 || *ptr == '/' || *ptr == '?' || *ptr == '#' || (url->is_special && *ptr == '\\')) {
        if (buffer.empty()) {
          // "host:" with nothing after the colon means "no explicit port";
          // 0 is the value this parser stores for the default port.
          url->port = 0;
        } else {
          // `buffer` holds only decimal digits here. Fold them manually so
          // that leading zeros are accepted and the value can never overflow:
          // parsing aborts as soon as it exceeds the 16-bit port range.
          int port = 0;
          for (char digit : buffer) {
            port = port * 10 + (digit - '0');
            if (port > 65535) {
              return false;
            }
          }
          // A port equal to the scheme's default is stored as 0.
          url->port = (port == scheme_port(url->scheme.c_str()) ? 0 : port);
        }
        buffer.clear();
        state = state_path_start;
        --ptr;
      } else {
        return false;
      }
      break;
    case state_file:
      url->scheme = "file";
      url->is_special = true;
      switch (*ptr) {
      case 0:
        if (base && base->scheme == "file") {
          copy_url(url, base, c_host | c_path | c_query);
        }
        break;
      case '\\':
        *errors = true;
        // fall through
      case '/':
        state = state_file_slash;
        break;
      case '?':
        if (base && base->scheme == "file") {
          copy_url(url, base, c_host | c_path);
        }
        state = state_query;
        break;
      case '#':
        if (base && base->scheme == "file") {
          copy_url(url, base, c_host | c_path | c_query);
        }
        state = state_fragment;
        break;
      default:
        // Inherit host and path from a file base unless the input starts
        // with a drive letter ("C:" / "C|") followed by '/', '\\', '?' or '#'.
        if (base && base->scheme == "file" && (!std::isalpha(ptr[0]) || (ptr[1] != ':' && ptr[1] != '|') ||
          ptr[2] == 0 || (ptr[2] != '/' && ptr[2] != '\\' && ptr[2] != '?' && ptr[2] != '#')))
        {
          copy_url(url, base, c_host | c_path);
          pop_path(url->path);
        } else if (base && base->scheme == "file") {
          // Drive letter against a file base: a syntax violation, but the
          // input is still parsed as an absolute path below.
          *errors = true;
        }
        state = state_path;
        --ptr;
      }
      break;
    case state_file_slash:
      if (*ptr == '/' || *ptr == '\\') {
        if (*ptr == '\\') *errors = true;
        state = state_file_host;
      } else {
        // Single leading slash: keep the base's drive letter, if it has one.
        if (base && base->scheme == "file" && base->path.size() && is_normalized_drive(base->path[0])) {
          url->path.push_back(base->path[0]);
        }
        state = state_path;
        --ptr;
      }
      break;
    case state_file_host:
      if (*ptr == 0 || *ptr == '/' || *ptr == '\\' || *ptr == '?' || *ptr == '#') {
        --ptr;
        if (is_drive(buffer)) {
          // "file://C:/..." - the would-be host is really the first path
          // segment; leave it in `buffer` for the path state to consume.
          *errors = true;
          state = state_path;
        } else if (buffer.empty()) {
          state = state_path_start;
        } else {
          if (!parse_host(buffer, url, errors)) {
            return false;
          }
          if (url->host == "localhost") {
            url->host.clear();
          }
          buffer.clear();
          state = state_path_start;
        }
      } else {
        buffer.push_back(*ptr);
      }
      break;
    case state_path_start:
      if (url->is_special && *ptr == '\\') {
        *errors = true;
      }
      state = state_path;
      // Consume the separator; anything else is re-read by the path state.
      if (*ptr != '/' && (!url->is_special || *ptr != '\\')) {
        --ptr;
      }
      break;
    case state_path:
      if (*ptr == 0 || *ptr == '/' || (url->is_special && *ptr == '\\') || *ptr == '?' || *ptr == '#') {
        // End of a path segment: resolve "." / ".." or append the segment.
        if (url->is_special && *ptr == '\\') {
          *errors = true;
        }
        if (buffer == "..") {
          pop_path(url->path);
          if (*ptr != '/' && (!url->is_special || *ptr != '\\')) {
            url->path.push_back("");
          }
        } else if (buffer == "." && *ptr != '/' && (!url->is_special || *ptr != '\\')) {
          url->path.push_back("");
        } else if (buffer != ".") {
          if (url->scheme == "file" && url->path.empty() && is_drive(buffer)) {
            if (url->host.size()) *errors = true;
            url->host.clear();
            buffer[1] = ':';  // normalise "C|" to "C:"
          }
          url->path.push_back(buffer);
        }
        buffer.clear();
        if (*ptr == '?') {
          state = state_query;
        } else if (*ptr == '#') {
          state = state_fragment;
        }
      } else {
        if (!url_code_point(*ptr) && *ptr != '%') {
          *errors = true;
        } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) {
          *errors = true;
        }
        if (*ptr == '%' && ptr[1] == '2' && (ptr[2] == 'e' || ptr[2] == 'E')) {
          // "%2e" is decoded so that encoded dot segments are recognised.
          buffer.push_back('.');
          ptr += 2;
        } else {
          percent_encode<encode_default>(buffer, *ptr);
        }
      }
      break;
    case state_non_relative_path:
      if (*ptr == '?') {
        state = state_query;
      } else if (*ptr == '#') {
        state = state_fragment;
      } else {
        if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') {
          *errors = true;
        } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) {
          *errors = true;
        }
        if (*ptr) {
          percent_encode<encode_simple>(url->path[0], *ptr);
        }
      }
      break;
    case state_query:
      if (*ptr == '#') {
        state = state_fragment;
      } else {
        if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') {
          *errors = true;
        } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) {
          *errors = true;
        }
        if (*ptr) {
          percent_encode<encode_query>(url->query, *ptr);
        }
      }
      break;
    case state_fragment:
      if (*ptr != 0 && !url_code_point(*ptr) && *ptr != '%') {
        *errors = true;
      } else if (*ptr == '%' && (!std::isxdigit(ptr[1]) || !std::isxdigit(ptr[2]))) {
        *errors = true;
      }
      if (*ptr) {
        // Fragment characters belong to the fragment, not the query.
        // NOTE(review): assumes url_t has a `fragment` string member
        // alongside `query` - confirm against the url_t declaration.
        url->fragment.push_back(*ptr);
      }
      break;
    }

    if (ptr < input + length) {
      ++ptr;
    } else {
      break;
    }
  }

  return true;
}