Example #1
bool
is_robots_txt_url (const char *url)
{
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  bool ret = are_urls_equal (url, robots_url);

  xfree (robots_url);

  return ret;
}
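In Wget's sources, RES_SPECS_LOCATION is the root-relative path "/robots.txt", so the helper above effectively asks whether url already points at its host's robots.txt. The standalone sketch below mimics the one property of uri_merge that this relies on: merging a root-relative link against a base keeps only the scheme://host part of the base. It is an illustration of those semantics under that assumption, not Wget's implementation, and merge_root_relative is a hypothetical name.

/* Simplified stand-in for uri_merge when the link is root-relative
   ("/robots.txt"): keep scheme://host[:port] of BASE and append LINK.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
merge_root_relative (const char *base, const char *link)
{
  const char *p = strstr (base, "://");
  const char *host_end;
  size_t prefix_len;
  char *result;

  p = p ? p + 3 : base;              /* skip "scheme://" if present */
  host_end = strchr (p, '/');        /* first '/' after the authority */
  prefix_len = host_end ? (size_t) (host_end - base) : strlen (base);

  result = malloc (prefix_len + strlen (link) + 1);
  if (!result)
    return NULL;
  memcpy (result, base, prefix_len);
  strcpy (result + prefix_len, link);
  return result;
}

int
main (void)
{
  char *merged = merge_root_relative ("http://example.com/dir/page.html",
                                      "/robots.txt");
  printf ("%s\n", merged);           /* prints http://example.com/robots.txt */
  free (merged);
  return 0;
}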
Example #2
bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url * url_parsed;

  /* Copy the server URI encoding for a possible IDNA transformation; there is
     no need to encode the full URI in UTF-8, because "robots.txt" is plain ASCII.  */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider       = false;

  url_parsed = url_parse (robots_url, &url_err, i, true);
  if (!url_parsed)
    {
      char *error = url_error (robots_url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
      xfree (error);
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free (url_parsed);
    }

  opt.timestamping = saved_ts_val;
  opt.spider       = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate it here so that the
         caller doesn't have to worry about it.  */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}
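A typical caller fetches the file and feeds it to the robots.txt parser. The fragment below is a minimal sketch of that pattern, assuming url and iri are already in scope and using the res_parse_from_file and res_match_path helpers declared alongside res_retrieve_file in Wget's res.h; the tested path is illustrative, not taken from Wget's actual call site.

  char *rfile = NULL;
  if (res_retrieve_file (url, &rfile, iri))
    {
      /* Parse the downloaded robots.txt and test a path against it.  */
      struct robot_specs *specs = res_parse_from_file (rfile);
      if (specs && !res_match_path (specs, "/private/index.html"))
        logputs (LOG_VERBOSE, "robots.txt disallows this path.\n");
      xfree (rfile);
    }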
Example #3
/* Construct and return a "transparent proxy" URL
   reflecting changes made by --adjust-extension to the file component
   (i.e., "basename") of the original URL, but leaving the "dirname"
   of the URL (protocol://hostname... portion) untouched.

   Think: populating a squid cache via a recursive wget scrape, where
   changing URLs to work locally with "file://..." is NOT desirable.

   Example:

   if
                     p = "//foo.com/bar.cgi?xyz"
   and
      link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
   then

      new_construct_func(p, link);
   will return
      "//foo.com/bar.cgi?xyz.css"

   Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
*/
static char *
convert_basename (const char *p, const struct urlpos *link)
{
  int len = link->size;
  char *url = NULL;
  char *org_basename = NULL, *local_basename = NULL;
  char *result = NULL;

  if (*p == '"' || *p == '\'')
    {
      len -= 2;
      p++;
    }

  url = xstrndup (p, len);

  org_basename = strrchr (url, '/');
  if (org_basename)
    org_basename++;
  else
    org_basename = url;

  local_basename = strrchr (link->local_name, '/');
  if (local_basename)
    local_basename++;
  else
    local_basename = link->local_name;

  /*
   * If the basenames differ, graft the adjusted basename (local_basename)
   * onto the original URL.
   */
  if (strcmp (org_basename, local_basename) == 0)
    result = url;
  else
    {
      result = uri_merge (url, local_basename);
      xfree (url);
    }

  return result;
}
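The substitution the header comment describes can be reproduced with nothing but the C library. The standalone snippet below uses the comment's own example values and prints the grafted URL; it only illustrates the s/$(basename orig_url)/$(basename link->local_name)/ idea and is not a replacement for the function above.

#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *orig  = "//foo.com/bar.cgi?xyz";
  const char *local = "docroot/foo.com/bar.cgi?xyz.css";

  const char *orig_base  = strrchr (orig, '/');
  const char *local_base = strrchr (local, '/');
  orig_base  = orig_base  ? orig_base + 1  : orig;
  local_base = local_base ? local_base + 1 : local;

  /* Keep the "dirname" of the original URL and graft the local basename.  */
  printf ("%.*s%s\n", (int) (orig_base - orig), orig, local_base);
  /* Prints: //foo.com/bar.cgi?xyz.css */
  return 0;
}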
Example #4
int
res_retrieve_file (const char *url, char **file)
{
    uerr_t err;
    char *robots_url = uri_merge (url, RES_SPECS_LOCATION);

    logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
    *file = NULL;
    err = retrieve_url (robots_url, file, NULL, NULL, NULL);
    xfree (robots_url);

    if (err != RETROK && *file != NULL)
    {
        /* If the file is not retrieved correctly, but retrieve_url
           allocated the file name, deallocate it here so that the
           caller doesn't have to worry about it.  */
        xfree (*file);
        *file = NULL;
    }
    return err == RETROK;
}
Example #5
static void
tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  struct urlpos *base_urlpos;
  int attrind;
  char *newbase = find_attr (tag, "href", &attrind);
  if (!newbase)
    return;

  base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
                            ATTR_SIZE(tag,attrind), ctx);
  if (!base_urlpos)
    return;
  base_urlpos->ignore_when_downloading = 1;
  base_urlpos->link_base_p = 1;

  if (ctx->base)
    xfree (ctx->base);
  if (ctx->parent_base)
    ctx->base = uri_merge (ctx->parent_base, newbase);
  else
    ctx->base = xstrdup (newbase);
}
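To make the base handling concrete: assuming ctx->parent_base holds the document's own URL, a relative <base href> is resolved against it, while an href with a scheme simply becomes the new base. The values below are hypothetical and only illustrate the expected outcome.

/* Hypothetical inputs and the resulting ctx->base (illustration only):
     parent_base = "http://example.com/docs/a.html", href = "media/"
       -> ctx->base is the merge, e.g. "http://example.com/docs/media/"
     parent_base = "http://example.com/docs/a.html", href = "http://cdn.example.net/"
       -> ctx->base is "http://cdn.example.net/" (an absolute href wins)
     no parent_base, href = "img/"
       -> ctx->base is simply a copy of "img/"  */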
Example #6
uerr_t
retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
              char **newloc, const char *refurl, int *dt, bool recursive,
              struct iri *iri, bool register_status)
{
  uerr_t result;
  char *url;
  bool location_changed;
  bool iri_fallbacked = false;
  int dummy;
  char *mynewloc, *proxy;
  struct url *u = orig_parsed, *proxy_url;
  int up_error_code;            /* url parse error code */
  char *local_file;
  int redirection_count = 0;

  bool post_data_suspended = false;
  char *saved_post_data = NULL;
  char *saved_post_file_name = NULL;

  /* If dt is NULL, use local storage.  */
  if (!dt)
    {
      dt = &dummy;
      dummy = 0;
    }
  url = xstrdup (origurl);
  if (newloc)
    *newloc = NULL;
  if (file)
    *file = NULL;

  if (!refurl)
    refurl = opt.referer;

 redirected:
  /* (also for IRI fallbacking) */

  result = NOCONERROR;
  mynewloc = NULL;
  local_file = NULL;
  proxy_url = NULL;

  proxy = getproxy (u);
  if (proxy)
    {
      struct iri *pi = iri_new ();
      set_uri_encoding (pi, opt.locale, true);
      pi->utf8_encode = false;

      /* Parse the proxy URL.  */
      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
      if (!proxy_url)
        {
          char *error = url_error (proxy, up_error_code);
          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
                     proxy, error);
          xfree (url);
          xfree (error);
          RESTORE_POST_DATA;
          result = PROXERR;
          goto bail;
        }
      if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
        {
          logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
          url_free (proxy_url);
          xfree (url);
          RESTORE_POST_DATA;
          result = PROXERR;
          goto bail;
        }
    }

  if (u->scheme == SCHEME_HTTP
#ifdef HAVE_SSL
      || u->scheme == SCHEME_HTTPS
#endif
      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
    {
      result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
                          proxy_url, iri);
    }
  else if (u->scheme == SCHEME_FTP)
    {
      /* If this is a redirection, temporarily turn off opt.ftp_glob
         and opt.recursive, both being undesirable when following
         redirects.  */
      bool oldrec = recursive, glob = opt.ftp_glob;
      if (redirection_count)
        oldrec = glob = false;

      result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
      recursive = oldrec;

      /* There is a possibility of having HTTP being redirected to
         FTP.  In these cases we must decide whether the text is HTML
         according to the suffix.  The HTML suffixes are `.html',
         `.htm' and a few others, case-insensitive.  */
      if (redirection_count && local_file && u->scheme == SCHEME_FTP)
        {
          if (has_html_suffix_p (local_file))
            *dt |= TEXTHTML;
        }
    }

  if (proxy_url)
    {
      url_free (proxy_url);
      proxy_url = NULL;
    }

  location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
  if (location_changed)
    {
      char *construced_newloc;
      struct url *newloc_parsed;

      assert (mynewloc != NULL);

      if (local_file)
        xfree (local_file);

      /* The HTTP specs only allow absolute URLs to appear in
         redirects, but a ton of boneheaded webservers and CGIs out
         there break the rules and use relative URLs, and popular
         browsers are lenient about this, so wget should be too. */
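      /* Illustrative example: if the current URL is
         "http://example.com/cgi/search.cgi" and the server replies with
         "Location: results.cgi?q=1", the merge below yields
         "http://example.com/cgi/results.cgi?q=1".  */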
      construced_newloc = uri_merge (url, mynewloc);
      xfree (mynewloc);
      mynewloc = construced_newloc;

      /* Reset UTF-8 encoding state, keep the URI encoding and reset
         the content encoding. */
      iri->utf8_encode = opt.enable_iri;
      set_content_encoding (iri, NULL);
      xfree_null (iri->orig_url);
      iri->orig_url = NULL;

      /* Now, see if this new location makes sense. */
      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
      if (!newloc_parsed)
        {
          char *error = url_error (mynewloc, up_error_code);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
                     error);
          if (orig_parsed != u)
            {
              url_free (u);
            }
          xfree (url);
          xfree (mynewloc);
          xfree (error);
          RESTORE_POST_DATA;
          goto bail;
        }

      /* Now mynewloc will become newloc_parsed->url, because if the
         Location contained relative paths like .././something, we
         don't want that propagating as url.  */
      xfree (mynewloc);
      mynewloc = xstrdup (newloc_parsed->url);

      /* Check for max. number of redirections.  */
      if (++redirection_count > opt.max_redirect)
        {
          logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                     opt.max_redirect);
          url_free (newloc_parsed);
          if (orig_parsed != u)
            {
              url_free (u);
            }
          xfree (url);
          xfree (mynewloc);
          RESTORE_POST_DATA;
          result = WRONGCODE;
          goto bail;
        }

      xfree (url);
      url = mynewloc;
      if (orig_parsed != u)
        {
          url_free (u);
        }
      u = newloc_parsed;

      /* If we're being redirected from POST, and we received a
         redirect code different than 307, we don't want to POST
         again.  Many requests answer POST with a redirection to an
         index page; that redirection is clearly a GET.  We "suspend"
         POST data for the duration of the redirections, and restore
         it when we're done.

         RFC 2616 (HTTP/1.1) introduces code 307 Temporary Redirect
         specifically to preserve the method of the request.  */
      if (result != NEWLOCATION_KEEP_POST && !post_data_suspended)
        SUSPEND_POST_DATA;

      goto redirected;
    }

  /* Try to not encode in UTF-8 if fetching failed */
  if (!(*dt & RETROKF) && iri->utf8_encode)
    {
      iri->utf8_encode = false;
      if (orig_parsed != u)
        {
          url_free (u);
        }
      u = url_parse (origurl, NULL, iri, true);
      if (u)
        {
          DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
          url = xstrdup (u->url);
          iri_fallbacked = true;
          goto redirected;
        }
      else
          DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
    }

  if (local_file && u && *dt & RETROKF)
    {
      register_download (u->url, local_file);

      if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
        register_redirection (origurl, u->url);

      if (*dt & TEXTHTML)
        register_html (local_file);

      if (*dt & TEXTCSS)
        register_css (local_file);
    }

  if (file)
    *file = local_file ? local_file : NULL;
  else
    xfree_null (local_file);

  if (orig_parsed != u)
    {
      url_free (u);
    }

  if (redirection_count || iri_fallbacked)
    {
      if (newloc)
        *newloc = url;
      else
        xfree (url);
    }
  else
    {
      if (newloc)
        *newloc = NULL;
      xfree (url);
    }

  RESTORE_POST_DATA;

bail:
  if (register_status)
    inform_exit_status (result);
  return result;
}
Example #7
struct urlpos *
get_urls_file (const char *file)
{
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;

  /* Load the file.  */
  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));

  head = tail = NULL;
  text = fm->content;
  text_end = fm->content + fm->length;
  while (text < text_end)
    {
      int up_error_code;
      char *url_text;
      struct urlpos *entry;
      struct url *url;

      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      if (!line_end)
        line_end = text_end;
      else
        ++line_end;
      text = line_end;

      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && c_isspace (*line_beg))
        ++line_beg;
      while (line_end > line_beg && c_isspace (*(line_end - 1)))
        --line_end;

      if (line_beg == line_end)
        continue;

      /* The URL is in the [line_beg, line_end) region. */

      /* We must copy the URL to a zero-terminated string, and we
         can't use alloca because we're in a loop.  *sigh*.  */
      url_text = strdupdelim (line_beg, line_end);

      if (opt.base_href)
        {
          /* Merge opt.base_href with URL. */
          char *merged = uri_merge (opt.base_href, url_text);
          xfree (url_text);
          url_text = merged;
        }
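      /* Illustrative example: with --base=http://example.com/dir/ (stored in
         opt.base_href) and an input line reading "page.html", url_text now
         holds "http://example.com/dir/page.html".  */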

      char *new_url = rewrite_shorthand_url (url_text);
      if (new_url)
        {
          xfree (url_text);
          url_text = new_url;
        }

      url = url_parse (url_text, &up_error_code, NULL, false);
      if (!url)
        {
          char *error = url_error (url_text, up_error_code);
          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
                     file, url_text, error);
          xfree (url_text);
          xfree (error);
          inform_exit_status (URLERROR);
          continue;
        }
      xfree (url_text);

      entry = xnew0 (struct urlpos);
      entry->url = url;

      if (!head)
        head = entry;
      else
        tail->next = entry;
      tail = entry;
    }
  wget_read_file_free (fm);
  return head;
}
Example #8
struct urlpos *
append_url (const char *link_uri, int position, int size,
            struct map_context *ctx)
{
  int link_has_scheme = url_has_scheme (link_uri);
  struct urlpos *newel;
  const char *base = ctx->base ? ctx->base : ctx->parent_base;
  struct url *url;

  struct iri *iri = iri_new ();
  set_uri_encoding (iri, opt.locale, true);
  iri->utf8_encode = true;

  if (!base)
    {
      DEBUGP (("%s: no base, merge will use \"%s\".\n",
               ctx->document_file, link_uri));

      if (!link_has_scheme)
        {
          /* Base URL is unavailable, and the link does not have a
             location attached to it -- we have to give up.  Since
             this can only happen when using `--force-html -i', print
             a warning.  */
          logprintf (LOG_NOTQUIET,
                     _("%s: Cannot resolve incomplete link %s.\n"),
                     ctx->document_file, link_uri);
          return NULL;
        }

      url = url_parse (link_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
                   ctx->document_file, link_uri));
          return NULL;
        }
    }
  else
    {
      /* Merge BASE with LINK_URI, but also make sure the result is
         canonicalized, i.e. that "../" have been resolved.
         (parse_url will do that for us.) */
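      /* Illustrative example: merging base "http://example.com/a/b/index.html"
         with link_uri "../style.css" ultimately produces the canonical
         "http://example.com/a/style.css" once url_parse below has collapsed
         the "../".  */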

      char *complete_uri = uri_merge (base, link_uri);

      DEBUGP (("%s: merge(%s, %s) -> %s\n",
               quotearg_n_style (0, escape_quoting_style, ctx->document_file),
               quote_n (1, base),
               quote_n (2, link_uri),
               quotearg_n_style (3, escape_quoting_style, complete_uri)));

      url = url_parse (complete_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
                   ctx->document_file, complete_uri));
          xfree (complete_uri);
          return NULL;
        }
      xfree (complete_uri);
    }

  iri_free (iri);

  DEBUGP (("appending %s to urlpos.\n", quote (url->url)));

  newel = xnew0 (struct urlpos);
  newel->url = url;
  newel->pos = position;
  newel->size = size;

  /* A URL is relative if the host is not named, and the name does not
     start with `/'.  */
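  /* For example, "foo/bar.html" and "?query" are marked link_relative_p,
     "http://host/x" is marked link_complete_p, and a root-relative
     "/abs/path" gets neither flag.  */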
  if (!link_has_scheme && *link_uri != '/')
    newel->link_relative_p = 1;
  else if (link_has_scheme)
    newel->link_complete_p = 1;

  /* Append the new URL maintaining the order by position.  */
  if (ctx->head == NULL)
    ctx->head = newel;
  else
    {
      struct urlpos *it, *prev = NULL;

      it = ctx->head;
      while (it && position > it->pos)
        {
          prev = it;
          it = it->next;
        }

      newel->next = it;

      if (prev)
        prev->next = newel;
      else
        ctx->head = newel;
    }

  return newel;
}
Example #9
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
              const char *refurl, int *dt, bool recursive)
{
  uerr_t result;
  char *url;
  bool location_changed;
  int dummy;
  char *mynewloc, *proxy;
  struct url *u, *proxy_url;
  int up_error_code;            /* url parse error code */
  char *local_file;
  int redirection_count = 0;

  bool post_data_suspended = false;
  char *saved_post_data = NULL;
  char *saved_post_file_name = NULL;

  /* If dt is NULL, use local storage.  */
  if (!dt)
    {
      dt = &dummy;
      dummy = 0;
    }
  url = xstrdup (origurl);
  if (newloc)
    *newloc = NULL;
  if (file)
    *file = NULL;

  u = url_parse (url, &up_error_code);
  if (!u)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
      xfree (url);
      return URLERROR;
    }

  if (!refurl)
    refurl = opt.referer;

 redirected:

  result = NOCONERROR;
  mynewloc = NULL;
  local_file = NULL;
  proxy_url = NULL;

  proxy = getproxy (u);
  if (proxy)
    {
      /* Parse the proxy URL.  */
      proxy_url = url_parse (proxy, &up_error_code);
      if (!proxy_url)
        {
          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
                     proxy, url_error (up_error_code));
          xfree (url);
          RESTORE_POST_DATA;
          return PROXERR;
        }
      if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
        {
          logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
          url_free (proxy_url);
          xfree (url);
          RESTORE_POST_DATA;
          return PROXERR;
        }
    }

  if (u->scheme == SCHEME_HTTP
#ifdef HAVE_SSL
      || u->scheme == SCHEME_HTTPS
#endif
      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
    {
      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
    }
  else if (u->scheme == SCHEME_FTP)
    {
      /* If this is a redirection, temporarily turn off opt.ftp_glob
         and opt.recursive, both being undesirable when following
         redirects.  */
      bool oldrec = recursive, glob = opt.ftp_glob;
      if (redirection_count)
        oldrec = glob = false;

      result = ftp_loop (u, dt, proxy_url, recursive, glob);
      recursive = oldrec;

      /* There is a possibility of having HTTP being redirected to
         FTP.  In these cases we must decide whether the text is HTML
         according to the suffix.  The HTML suffixes are `.html',
         `.htm' and a few others, case-insensitive.  */
      if (redirection_count && local_file && u->scheme == SCHEME_FTP)
        {
          if (has_html_suffix_p (local_file))
            *dt |= TEXTHTML;
        }
    }

  if (proxy_url)
    {
      url_free (proxy_url);
      proxy_url = NULL;
    }

  location_changed = (result == NEWLOCATION);
  if (location_changed)
    {
      char *construced_newloc;
      struct url *newloc_parsed;

      assert (mynewloc != NULL);

      if (local_file)
        xfree (local_file);

      /* The HTTP specs only allow absolute URLs to appear in
         redirects, but a ton of boneheaded webservers and CGIs out
         there break the rules and use relative URLs, and popular
         browsers are lenient about this, so wget should be too. */
      construced_newloc = uri_merge (url, mynewloc);
      xfree (mynewloc);
      mynewloc = construced_newloc;

      /* Now, see if this new location makes sense. */
      newloc_parsed = url_parse (mynewloc, &up_error_code);
      if (!newloc_parsed)
        {
          logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
                     url_error (up_error_code));
          url_free (u);
          xfree (url);
          xfree (mynewloc);
          RESTORE_POST_DATA;
          return result;
        }

      /* Now mynewloc will become newloc_parsed->url, because if the
         Location contained relative paths like .././something, we
         don't want that propagating as url.  */
      xfree (mynewloc);
      mynewloc = xstrdup (newloc_parsed->url);

      /* Check for max. number of redirections.  */
      if (++redirection_count > opt.max_redirect)
        {
          logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                     opt.max_redirect);
          url_free (newloc_parsed);
          url_free (u);
          xfree (url);
          xfree (mynewloc);
          RESTORE_POST_DATA;
          return WRONGCODE;
        }

      xfree (url);
      url = mynewloc;
      url_free (u);
      u = newloc_parsed;

      /* If we're being redirected from POST, we don't want to POST
         again.  Many requests answer POST with a redirection to an
         index page; that redirection is clearly a GET.  We "suspend"
         POST data for the duration of the redirections, and restore
         it when we're done. */
      if (!post_data_suspended)
        SUSPEND_POST_DATA;

      goto redirected;
    }

  if (local_file)
    {
      if (*dt & RETROKF)
        {
          register_download (u->url, local_file);
          if (redirection_count && 0 != strcmp (origurl, u->url))
            register_redirection (origurl, u->url);
          if (*dt & TEXTHTML)
            register_html (u->url, local_file);
        }
    }

  if (file)
    *file = local_file ? local_file : NULL;
  else
    xfree_null (local_file);

  url_free (u);

  if (redirection_count)
    {
      if (newloc)
        *newloc = url;
      else
        xfree (url);
    }
  else
    {
      if (newloc)
        *newloc = NULL;
      xfree (url);
    }

  RESTORE_POST_DATA;

  return result;
}