Example #1
/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  From these two lists
   information is extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
	DEBUGP (("It should correspond to %s.\n", url));
      else
	DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
	{
	  char *local_name;

	  /* The URL must be in canonical form to be compared.  */
	  struct urlinfo *u = newurl ();
	  uerr_t res = parseurl (cur_url->url, u, 0);
	  if (res != URLOK)
	    {
	      freeurl (u, 1);
	      continue;
	    }
	  /* We decide the direction of conversion according to whether
	     a URL was downloaded.  Downloaded URLs will be converted
	     ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
	  local_name = hash_table_get (dl_url_file_map, u->url);
	  if (local_name)
	    DEBUGP (("%s marked for conversion, local %s\n",
		     u->url, local_name));
	  /* Decide on the conversion direction.  */
	  if (local_name)
	    {
	      /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (local_name);
	    }
	  else
	    {
	      /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
	      if (!cur_url->link_complete_p)
		cur_url->convert = CO_CONVERT_TO_COMPLETE;
	      cur_url->local_name = NULL;
	    }
	  freeurl (u, 1);
	}
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      /* Free the data.  */
      free_urlpos (urls);
    }
}
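
Both versions of convert_all_links begin by destructively reversing the list of downloaded HTML files, because recursive_retrieve() builds it by prepending. Below is a minimal sketch of that idiom, assuming slist is Wget's usual singly linked list of strings with the string/next fields used above; the real definitions live in Wget's utils.c and may differ in detail:

#include <stdlib.h>
#include <string.h>

/* Minimal stand-in for Wget's slist; the field names match the
   html->string / html->next accesses in the example above.  */
typedef struct _slist {
  char *string;
  struct _slist *next;
} slist;

/* Prepending is O(1), which is why the retrieval loop builds the
   list back to front.  (strdup is POSIX; error handling is omitted
   in this sketch.)  */
slist *
slist_prepend (slist *l, const char *s)
{
  slist *n = malloc (sizeof (slist));
  n->string = strdup (s);
  n->next = l;
  return n;
}

/* Destructive in-place pointer reversal: O(n) time, O(1) extra
   space.  Afterwards the first file downloaded is first in the
   list again.  */
slist *
slist_nreverse (slist *l)
{
  slist *prev = NULL;
  while (l)
    {
      slist *next = l->next;
      l->next = prev;
      prev = l;
      l = next;
    }
  return prev;
}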
Example #2
void
convert_all_links (void)
{
  slist *html;
  long msecs;
  int file_count = 0;

  struct wget_timer *timer = wtimer_new ();

  /* Destructively reverse downloaded_html_list to get it in the right
     order.  recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_list = slist_nreverse (downloaded_html_list);

  for (html = downloaded_html_list; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = html->string;

      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
	{
	  DEBUGP (("Apparently %s has been removed.\n", file));
	  continue;
	}

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the HTML file...  */
      urls = get_urls_html (file, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
	{
	  char *local_name;
	  struct url *u = cur_url->url;

	  if (cur_url->link_base_p)
	    {
	      /* Base references have been resolved by our parser, so
		 we turn the base URL into an empty string.  (Perhaps
		 we should remove the tag entirely?)  */
	      cur_url->convert = CO_NULLIFY_BASE;
	      continue;
	    }

	  /* We decide the direction of conversion according to whether
	     a URL was downloaded.  Downloaded URLs will be converted
	     ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
	  local_name = hash_table_get (dl_url_file_map, u->url);

	  /* Decide on the conversion type.  */
	  if (local_name)
	    {
	      /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (local_name);
	      DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
	    }
	  else
	    {
	      /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
	      if (!cur_url->link_complete_p)
		cur_url->convert = CO_CONVERT_TO_COMPLETE;
	      cur_url->local_name = NULL;
	      DEBUGP (("will convert url %s to complete\n", u->url));
	    }
	}

      /* Convert the links in the file.  */
      convert_links (file, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
	     file_count, (double)msecs / 1000);
}
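
In both versions the conversion direction reduces to a single lookup in dl_url_file_map: a URL that maps to a local file is rewritten to point at that file, and anything else is made absolute so the page still works when browsed offline. The helper below is a hypothetical condensed restatement of that decision, not Wget code; three of the enum constants appear in the listings above, while CO_NOCONVERT is assumed as the "leave it alone" default:

/* Conversion choices as used in the listings above (Wget defines
   these in its own headers; the names here are assumed).  */
enum convert_options {
  CO_NOCONVERT = 0,		/* leave the link untouched */
  CO_CONVERT_TO_RELATIVE,	/* downloaded: point at the local copy */
  CO_CONVERT_TO_COMPLETE,	/* not downloaded: make the URL absolute */
  CO_NULLIFY_BASE		/* <base href> already resolved by the parser */
};

/* Hypothetical helper: decide what to do with one link, given the
   result of the dl_url_file_map lookup and whether the link was
   already written in complete (absolute) form.  */
static enum convert_options
decide_conversion (const char *local_name, int link_complete_p)
{
  if (local_name)
    /* We have a local copy, so point at it even if the link is
       already relative; -nd or --cut-dirs may have changed the
       local directory layout.  */
    return CO_CONVERT_TO_RELATIVE;
  if (!link_complete_p)
    /* No local copy: complete the URL so it remains reachable when
       the page is browsed locally.  */
    return CO_CONVERT_TO_COMPLETE;
  return CO_NOCONVERT;
}

Apart from the new CO_NULLIFY_BASE case for <base> tags, the decision logic in example #2 is unchanged from example #1; the second version mainly adds the removed-file check, a file counter, and timing of the whole pass with Wget's wtimer_* helpers.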