struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
               struct iri *iri)
{
  struct file_memory *fm;
  struct map_context ctx;
  int flags;

  /* Load the file. */
  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));

  ctx.text = fm->content;
  ctx.head = NULL;
  ctx.base = NULL;
  ctx.parent_base = url ? url : opt.base_href;
  ctx.document_file = file;
  ctx.nofollow = false;

  if (!interesting_tags)
    init_interesting ();

  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
     generate <a href=" foo"> instead of <a href="foo"> (browsers
     ignore spaces as well.)  If you really mean space, use &32; or
     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
     e.g. in <img src="foo.[newline]html">.  Such newlines are also
     ignored by IE and Mozilla and are presumably introduced by
     writing HTML with editors that force word wrap.  */
  flags = MHT_TRIM_VALUES;
  if (opt.strict_comments)
    flags |= MHT_STRICT_COMMENTS;

  /* the NULL here used to be interesting_tags */
  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                 NULL, interesting_attributes);

  /* If meta charset isn't null, override content encoding */
  if (iri && meta_charset)
    set_content_encoding (iri, meta_charset);

  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = ctx.nofollow;

  xfree_null (ctx.base);
  wget_read_file_free (fm);
  return ctx.head;
}
Exemple #2
0
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{
  uerr_t status;
  struct urlpos *url_list, *cur_url;
  struct iri *iri = iri_new();

  char *input_file, *url_file = NULL;
  const char *url = file;

  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */

  /* sXXXav : Assume filename and links in the file are in the locale */
  set_uri_encoding (iri, opt.locale, true);
  set_content_encoding (iri, opt.locale);

  if (url_valid_scheme (url))
    {
      int dt,url_err;
      uerr_t status;
      struct url *url_parsed = url_parse (url, &url_err, iri, true);
      if (!url_parsed)
        {
          char *error = url_error (url, url_err);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
          xfree (error);
          return URLERROR;
        }

      if (!opt.base_href)
        opt.base_href = xstrdup (url);

      status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
                             false, iri, true);
      url_free (url_parsed);

      if (!url_file || (status != RETROK))
        return status;

      if (dt & TEXTHTML)
        html = true;

      /* If we have a found a content encoding, use it.
       * ( == is okay, because we're checking for identical object) */
      if (iri->content_encoding != opt.locale)
	  set_uri_encoding (iri, iri->content_encoding, false);

      /* Reset UTF-8 encode status */
      iri->utf8_encode = opt.enable_iri;
      xfree_null (iri->orig_url);
      iri->orig_url = NULL;

      input_file = url_file;
    }
  else
    input_file = (char *) file;

  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
              : get_urls_file (input_file));

  xfree_null (url_file);

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename = NULL, *new_file = NULL;
      int dt;
      struct iri *tmpiri = iri_dup (iri);
      struct url *parsed_url = NULL;

      if (cur_url->ignore_when_downloading)
        continue;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        {
          status = QUOTEXC;
          break;
        }

      parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);

      if ((opt.recursive || opt.page_requisites)
          && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
        {
          int old_follow_ftp = opt.follow_ftp;

          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
          if (cur_url->url->scheme == SCHEME_FTP)
            opt.follow_ftp = 1;

          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
                                  tmpiri);

          opt.follow_ftp = old_follow_ftp;
        }
      else
        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
                               cur_url->url->url, &filename,
                               &new_file, NULL, &dt, opt.recursive, tmpiri,
                               true);

      if (parsed_url)
          url_free (parsed_url);

      if (filename && opt.delete_after && file_exists_p (filename))
        {
          DEBUGP (("\
Removing file due to --delete-after in retrieve_from_file():\n"));
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          dt &= ~RETROKF;
        }

      xfree_null (new_file);
      xfree_null (filename);
      iri_free (tmpiri);
    }
Exemple #3
0
uerr_t
retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
              char **newloc, const char *refurl, int *dt, bool recursive,
              struct iri *iri, bool register_status)
{
  uerr_t result;
  char *url;
  bool location_changed;
  bool iri_fallbacked = 0;
  int dummy;
  char *mynewloc, *proxy;
  struct url *u = orig_parsed, *proxy_url;
  int up_error_code;            /* url parse error code */
  char *local_file;
  int redirection_count = 0;

  bool post_data_suspended = false;
  char *saved_post_data = NULL;
  char *saved_post_file_name = NULL;

  /* If dt is NULL, use local storage.  */
  if (!dt)
    {
      dt = &dummy;
      dummy = 0;
    }
  url = xstrdup (origurl);
  if (newloc)
    *newloc = NULL;
  if (file)
    *file = NULL;

  if (!refurl)
    refurl = opt.referer;

 redirected:
  /* (also for IRI fallbacking) */

  result = NOCONERROR;
  mynewloc = NULL;
  local_file = NULL;
  proxy_url = NULL;

  proxy = getproxy (u);
  if (proxy)
    {
      struct iri *pi = iri_new ();
      set_uri_encoding (pi, opt.locale, true);
      pi->utf8_encode = false;

      /* Parse the proxy URL.  */
      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
      if (!proxy_url)
        {
          char *error = url_error (proxy, up_error_code);
          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
                     proxy, error);
          xfree (url);
          xfree (error);
          RESTORE_POST_DATA;
          result = PROXERR;
          goto bail;
        }
      if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
        {
          logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
          url_free (proxy_url);
          xfree (url);
          RESTORE_POST_DATA;
          result = PROXERR;
          goto bail;
        }
    }

  if (u->scheme == SCHEME_HTTP
#ifdef HAVE_SSL
      || u->scheme == SCHEME_HTTPS
#endif
      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
    {
      result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
                          proxy_url, iri);
    }
  else if (u->scheme == SCHEME_FTP)
    {
      /* If this is a redirection, temporarily turn off opt.ftp_glob
         and opt.recursive, both being undesirable when following
         redirects.  */
      bool oldrec = recursive, glob = opt.ftp_glob;
      if (redirection_count)
        oldrec = glob = false;

      result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
      recursive = oldrec;

      /* There is a possibility of having HTTP being redirected to
         FTP.  In these cases we must decide whether the text is HTML
         according to the suffix.  The HTML suffixes are `.html',
         `.htm' and a few others, case-insensitive.  */
      if (redirection_count && local_file && u->scheme == SCHEME_FTP)
        {
          if (has_html_suffix_p (local_file))
            *dt |= TEXTHTML;
        }
    }

  if (proxy_url)
    {
      url_free (proxy_url);
      proxy_url = NULL;
    }

  location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
  if (location_changed)
    {
      char *construced_newloc;
      struct url *newloc_parsed;

      assert (mynewloc != NULL);

      if (local_file)
        xfree (local_file);

      /* The HTTP specs only allow absolute URLs to appear in
         redirects, but a ton of boneheaded webservers and CGIs out
         there break the rules and use relative URLs, and popular
         browsers are lenient about this, so wget should be too. */
      construced_newloc = uri_merge (url, mynewloc);
      xfree (mynewloc);
      mynewloc = construced_newloc;

      /* Reset UTF-8 encoding state, keep the URI encoding and reset
         the content encoding. */
      iri->utf8_encode = opt.enable_iri;
      set_content_encoding (iri, NULL);
      xfree_null (iri->orig_url);
      iri->orig_url = NULL;

      /* Now, see if this new location makes sense. */
      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
      if (!newloc_parsed)
        {
          char *error = url_error (mynewloc, up_error_code);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
                     error);
          if (orig_parsed != u)
            {
              url_free (u);
            }
          xfree (url);
          xfree (mynewloc);
          xfree (error);
          RESTORE_POST_DATA;
          goto bail;
        }

      /* Now mynewloc will become newloc_parsed->url, because if the
         Location contained relative paths like .././something, we
         don't want that propagating as url.  */
      xfree (mynewloc);
      mynewloc = xstrdup (newloc_parsed->url);

      /* Check for max. number of redirections.  */
      if (++redirection_count > opt.max_redirect)
        {
          logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
                     opt.max_redirect);
          url_free (newloc_parsed);
          if (orig_parsed != u)
            {
              url_free (u);
            }
          xfree (url);
          xfree (mynewloc);
          RESTORE_POST_DATA;
          result = WRONGCODE;
          goto bail;
        }

      xfree (url);
      url = mynewloc;
      if (orig_parsed != u)
        {
          url_free (u);
        }
      u = newloc_parsed;

      /* If we're being redirected from POST, and we received a
         redirect code different than 307, we don't want to POST
         again.  Many requests answer POST with a redirection to an
         index page; that redirection is clearly a GET.  We "suspend"
         POST data for the duration of the redirections, and restore
         it when we're done.
	 
	 RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
	 specifically to preserve the method of the request.
	 */
      if (result != NEWLOCATION_KEEP_POST && !post_data_suspended)
        SUSPEND_POST_DATA;

      goto redirected;
    }

  /* Try to not encode in UTF-8 if fetching failed */
  if (!(*dt & RETROKF) && iri->utf8_encode)
    {
      iri->utf8_encode = false;
      if (orig_parsed != u)
        {
          url_free (u);
        }
      u = url_parse (origurl, NULL, iri, true);
      if (u)
        {
          DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
          url = xstrdup (u->url);
          iri_fallbacked = 1;
          goto redirected;
        }
      else
          DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
    }

  if (local_file && u && *dt & RETROKF)
    {
      register_download (u->url, local_file);

      if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
        register_redirection (origurl, u->url);

      if (*dt & TEXTHTML)
        register_html (local_file);

      if (*dt & TEXTCSS)
        register_css (local_file);
    }

  if (file)
    *file = local_file ? local_file : NULL;
  else
    xfree_null (local_file);

  if (orig_parsed != u)
    {
      url_free (u);
    }

  if (redirection_count || iri_fallbacked)
    {
      if (newloc)
        *newloc = url;
      else
        xfree (url);
    }
  else
    {
      if (newloc)
        *newloc = NULL;
      xfree (url);
    }

  RESTORE_POST_DATA;

bail:
  if (register_status)
    inform_exit_status (result);
  return result;
}