Exemple #1
0
char *retrieve_remote_pkg_filename (const char *pkgname, 
				    int *err)
{
    char *fname = NULL;
    char *buf = NULL;

    *err = retrieve_url(gretlhost, FUNC_FULLNAME, pkgname, NULL, 
			NULL, 0, &buf);
    
    if (!*err) {
	if (buf == NULL) {
	    *err = E_DATA;
	} else {
	    if (strstr(buf, "not found")) {
		gretl_errmsg_set(buf);
		*err = E_DATA;
	    } else {
		char tmp[64];

		sscanf(buf, "%63s", tmp);
		fname = gretl_strdup(tmp);
	    }
	    free(buf);
	}
    }

    return fname;
}
Exemple #2
0
int retrieve_remote_db (const char *dbname, 
			const char *localname,
			int opt)
{
    return retrieve_url(dbhost, opt, dbname, NULL, 
			localname, 0, NULL);
}
Exemple #3
0
int inetget_main(const int argc, const wchar_t *const argv[])
{
	//Print application info
	print_logo();

	//Initialize parameters
	Params params;

	//Load configuration file, if it exists
	const std::wstring config_file = Utils::exe_path(L".cfg");
	if(Utils::file_exists(config_file))
	{
		if(!params.load_conf_file(config_file))
		{
			std::wcerr << "Invalid configuration file, refer to the documentation for details!\n" << std::endl;
			return EXIT_FAILURE;
		}
	}

	//Parse command-line parameters
	if(!params.parse_cli_args(argc, argv))
	{
		std::wcerr << "Invalid command-line arguments, type \"INetGet.exe --help\" for details!\n" << std::endl;
		return EXIT_FAILURE;
	}

	//Show help screen, if it was requested
	if(params.getShowHelp())
	{
		print_help_screen();
		return EXIT_SUCCESS;
	}

	//Parse the specified source URL
	const std::wstring source = (params.getSource().compare(L"-") == 0) ? Utils::utf8_to_wide_str(stdin_get_line()) : params.getSource();
	URL url(source);
	if(!url.isComplete())
	{
		std::wcerr << "The specified URL is incomplete or unsupported:\n" << source << L'\n' << std::endl;
		return EXIT_FAILURE;
	}

	//Print request URL
	const std::wstring url_string = url.toString();
	std::wcerr << L"Request address:\n" << url.toString() << L'\n' << std::endl;
	Utils::set_console_title(std::wstring(L"INetGet - ").append(url_string));

	//Create the HTTP(S) client
	std::unique_ptr<AbstractClient> client;
	StatusListener listener;
	if(!create_client(client, listener, url.getScheme(), params))
	{
		std::wcerr << "Specified protocol is unsupported! Only HTTP(S) and FTP are allowed.\n" << std::endl;
		return EXIT_FAILURE;
	}

	//Retrieve the URL
	return retrieve_url(client.get(), url_string, params.getHttpVerb(), url, params.getPostData(), params.getReferrer(), params.getOutput(), params.getSetTimestamp(), params.getUpdateMode(), params.getEnableAlert(), params.getKeepFailed());
}
Exemple #4
0
int retrieve_remote_db_data (const char *dbname,
			     const char *varname,
			     char **getbuf,
			     int opt)
{
    return retrieve_url(dbhost, opt, dbname, varname, 
			NULL, 0, getbuf);
}
Exemple #5
0
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{
  uerr_t status;
  struct urlpos *url_list, *cur_url;

  url_list = (html ? get_urls_html (file, NULL, NULL)
              : get_urls_file (file));
  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename = NULL, *new_file = NULL;
      int dt;

      if (cur_url->ignore_when_downloading)
        continue;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        {
          status = QUOTEXC;
          break;
        }
      if ((opt.recursive || opt.page_requisites)
          && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
        {
          int old_follow_ftp = opt.follow_ftp;

          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
          if (cur_url->url->scheme == SCHEME_FTP)
            opt.follow_ftp = 1;

          status = retrieve_tree (cur_url->url->url);

          opt.follow_ftp = old_follow_ftp;
        }
      else
        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);

      if (filename && opt.delete_after && file_exists_p (filename))
        {
          DEBUGP (("\
Removing file due to --delete-after in retrieve_from_file():\n"));
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          dt &= ~RETROKF;
        }

      xfree_null (new_file);
      xfree_null (filename);
    }
Exemple #6
0
bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url * url_parsed;

  /* Copy server URI encoding for a possible IDNA transformation, no need to
     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider       = false;

  url_parsed = url_parse (robots_url, &url_err, i, true);
  if (!url_parsed)
    {
      char *error = url_error (robots_url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
      xfree (error);
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free(url_parsed);
    }

  opt.timestamping = saved_ts_val;
  opt.spider       = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate is here so that the
         caller doesn't have to worry about it.  */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}
Exemple #7
0
int retrieve_remote_function_package (const char *pkgname, 
				      const char *localname)
{
    int err;

    err = retrieve_url(gretlhost, GRAB_FUNC, pkgname, NULL, 
		       localname, 0, NULL);

    if (!err) {
	err = check_downloaded_file(localname, pkgname);
    }

    return err;
}
Exemple #8
0
/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
{
  int dt;
  uerr_t err;
  struct urlinfo *u;

  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
  freeurl (u, 1);
  if (err == RETROK)
    return ROBOTSOK;
  else
    return NOROBOTS;
}
Exemple #9
0
Fichier : retr.c Projet : aosm/wget
/* Find the URL-s in the file and call retrieve_url() for each of
   them.  If HTML is non-zero, treat the file as HTML, and construct
   the URL-s accordingly.

   If opt.recursive is set, call recursive_retrieve() for each file.  */
uerr_t
retrieve_from_file (const char *file, int html, int *count)
{
  uerr_t status;
  urlpos *url_list, *cur_url;

  /* If spider-mode is on, we do not want get_urls_html barfing
     errors on baseless links.  */
  url_list = (html ? get_urls_html (file, NULL, opt.spider)
	      : get_urls_file (file));
  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */
  recursive_reset ();
  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename, *new_file;
      int dt;

      if (opt.quota && opt.downloaded > opt.quota)
	{
	  status = QUOTEXC;
	  break;
	}
      status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
      if (opt.recursive && status == RETROK && (dt & TEXTHTML))
	status = recursive_retrieve (filename, new_file ? new_file : cur_url->url);

      if (filename && opt.delete_after && file_exists_p (filename))
	{
	  logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
	  if (unlink (filename))
	    logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	  dt &= ~RETROKF;
	}

      FREE_MAYBE (new_file);
      FREE_MAYBE (filename);
    }

  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);

  return status;
}
Exemple #10
0
int check_remote_db (const char *dbname)
{
    char *getbuf = NULL;
    int err;

    err = retrieve_url(dbhost, CHECK_DB, dbname, NULL, 
		       NULL, 0, &getbuf);

    if (!err && getbuf != NULL) {
	err = strncmp(getbuf, "OK", 2) != 0;
    } 

    free(getbuf);

    if (err) {
	err = E_FOPEN;
    }

    return err;
}
Exemple #11
0
int
res_retrieve_file (const char *url, char **file)
{
    uerr_t err;
    char *robots_url = uri_merge (url, RES_SPECS_LOCATION);

    logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
    *file = NULL;
    err = retrieve_url (robots_url, file, NULL, NULL, NULL);
    xfree (robots_url);

    if (err != RETROK && *file != NULL)
    {
        /* If the file is not retrieved correctly, but retrieve_url
        allocated the file name, deallocate is here so that the
         caller doesn't have to worry about it.  */
        xfree (*file);
        *file = NULL;
    }
    return err == RETROK;
}
Exemple #12
0
/*!
@brief download a file.

@return uerr_t - the error value after downloading a file.
@see wget
*/
uerr_t classDownloader::getFile(bool bUseProgress)
{
	uerr_t err_value=RETROK;
	int index;

  	char *filename = NULL, *redirected_URL = NULL;
  	int dt;

  	if (opt.verbose == -1){
                opt.verbose = !opt.quiet;
	}

  	if (opt.verbose && opt.quiet){
  		cout<<"Can't be verbose and quiet at the same time."<<endl;
    		exit (1);
    	}

  	if (opt.timestamping && opt.noclobber){
    		cout<<"Can't timestamp and not clobber old files at the same time."<<endl;
    		exit (1);
    	}

  	if (opt.verbose){
    		set_progress_implementation (opt.progress_type);
	}

	if(bUseProgress){
		SetGetFileCallBack(GetFileCallBack);
	}else{
		SetGetFileCallBack(NULL);
	}
	
	pushFileName();
	
	err_value=retrieve_url(m_strUrl.c_str(), &filename, &redirected_URL, NULL, &dt);
  
  	return err_value;
}
Exemple #13
0
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
	undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  string_set_add (undesirable_urls, u->url);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user. */
    {
      if (opt.page_requisites && depth <= opt.reclevel + 1)
	/* When -p is specified, we can do one more partial recursion from the
	   "leaf nodes" on the HTML document tree.  The recursion is partial in
	   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
	   except for <LINK REL="stylesheet">. */
	dash_p_leaf_HTML = TRUE;
      else
	/* Either -p wasn't specified or it was and we've already gone the one
	   extra (pseudo-)level that it affords us, so we need to bail out. */
	{
	  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
		   depth, opt.reclevel));
	  --depth;
	  return RECLEVELEXC;
	}
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
			    dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded regardless of
	 suffix rules (but that is remedied later with unlink) unless
	 the depth equals the maximum depth.

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 undesirable_urls.  Using it is crucial to avoid unnecessary
	 repeated continuous hits to the hash table.  */
      inl = string_set_contains (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !cur_url->link_relative_p)
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
            subject to acc/rej rules, unless a finite maximum depth has
            been specified and the current depth is the maximum depth. */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                      && ((opt.reclevel != INFINITE_RECURSION) &&
			  (depth != opt.reclevel))))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = TOLOWER (*p);
	      xfree (u->url);
	      u->url = str_url (u, 0);
	    }
	  xfree (constr);
	  constr = xstrdup (u->url);
	  string_set_add (undesirable_urls, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		string_set_add (undesirable_urls, constr);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  xfree (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  string_set_add (undesirable_urls, constr);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      xfree (constr);
	      constr = newloc;
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));

	  if (opt.delete_after || (filename && !acceptable (filename)))
	    /* Either --delete-after was specified, or we loaded this otherwise
	       rejected (e.g. by -R) HTML file just so we could harvest its
	       hyperlinks -- in either case, delete the local file. */
	    {
	      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		       opt.delete_after ? "--delete-after" :
		       "recursive rejection criteria"));
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }

	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}
Exemple #14
0
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URL-s stored to a linked list of URL-s, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      ulist = add_slist (ulist, this_url, 0);
      urls_downloaded = NULL;
      urls_html = NULL;
      /* Enter this_url to the slist, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  ulist = add_slist (ulist, u->url, 0);
	  urls_downloaded = add_url (urls_downloaded, u->url, file);
	  urls_html = add_slist (urls_html, file, NOSORT);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
    {
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
	       depth, opt.reclevel));
      --depth;
      return RECLEVELEXC;
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file,
			    canon_this_url ? canon_this_url : this_url, 0);

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded
	 regardless of suffix rules (but that is remedied later with
	 unlink).

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 ulist.  Using it is crucial to avoid the incessant calls to
	 in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !(cur_url->flags & URELATIVE))
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
	     subject to acc/rej rules.  That's why the `!'.  */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
		   && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = tolower (*p);
	      free (u->url);
	      u->url = str_url (u, 0);
	    }
	  free (constr);
	  constr = xstrdup (u->url);
	  inl = in_slist (ulist, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		ulist = add_slist (ulist, constr, 0);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  free (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  ulist = add_slist (ulist, constr, 0);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      free (constr);
	      constr = newloc;
	    }
	  /* In case of convert_links: If there was no error, add it to
	     the list of downloaded URLs.  We might need it for
	     conversion.  */
	  if (opt.convert_links && filename)
	    {
	      if (dt & RETROKF)
		{
		  urls_downloaded = add_url (urls_downloaded, constr, filename);
		  /* If the URL is HTML, note it.  */
		  if (dt & TEXTHTML)
		    urls_html = add_slist (urls_html, filename, NOSORT);
		}
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));
	  /* If an suffix-rejected file was loaded only because it was HTML,
	     undo the error now */
	  if (opt.delete_after || (filename && !acceptable (filename)))
	    {
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }
	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->flags |= UABS2REL;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  else
    return RETROK;
}
Exemple #15
0
int list_remote_data_packages (char **getbuf)
{
    return retrieve_url(sfweb, LIST_PKGS, NULL, NULL, 
			NULL, 0, getbuf);
}
Exemple #16
0
int list_remote_function_categories (char **getbuf)
{
    return retrieve_url(gretlhost, LIST_CATS, NULL, NULL, 
			NULL, 0, getbuf);
}
Exemple #17
0
int query_sourceforge (const char *query, char **getbuf)
{
    return retrieve_url(sfweb, QUERY_SF, query, NULL, 
			NULL, 0, getbuf);
}
Exemple #18
0
int retrieve_remote_gfn_content (const char *zipname, 
				 const char *localname)
{
    return retrieve_url(gretlhost, GRAB_FUNC_INFO, zipname, NULL, 
			localname, 0, NULL);
}
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
		 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth, html_allowed;
      boolean dash_p_leaf_HTML = FALSE;

      if (opt.quota && total_downloaded_bytes > opt.quota)
	break;
      if (status == FWRITEERR)
	break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
			(const char **)&url, (const char **)&referer,
			&depth, &html_allowed))
	break;

      /* ...and download it.  Note that this download is in most cases
	 unconditional, as download_child_p already makes sure a file
	 doesn't get enqueued twice -- and yet this check is here, and
	 not in download_child_p.  This is so that if you run `wget -r
	 URL1 URL2', and a random URL is encountered once under URL1
	 and again under URL2, but at a different (possibly smaller)
	 depth, we want the URL's children to be taken into account
	 the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
	{
	  file = xstrdup (hash_table_get (dl_url_file_map, url));

	  DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
		   url, file));

	  if (html_allowed
	      && downloaded_html_set
	      && string_set_contains (downloaded_html_set, file))
	    descend = 1;
	}
      else
	{
	  int dt = 0;
	  char *redirected = NULL;
	  int oldrec = opt.recursive;

	  opt.recursive = 0;
	  status = retrieve_url (url, &file, &redirected, referer, &dt);
	  opt.recursive = oldrec;

	  if (html_allowed && file && status == RETROK
	      && (dt & RETROKF) && (dt & TEXTHTML))
	    descend = 1;

	  if (redirected)
	    {
	      /* We have been redirected, possibly to another host, or
		 different path, or wherever.  Check whether we really
		 want to follow it.  */
	      if (descend)
		{
		  if (!descend_redirect_p (redirected, url, depth,
					   start_url_parsed, blacklist))
		    descend = 0;
		  else
		    /* Make sure that the old pre-redirect form gets
		       blacklisted. */
		    string_set_add (blacklist, url);
		}

	      xfree (url);
	      url = redirected;
	    }
	}

      if (descend
	  && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
	{
	  if (opt.page_requisites
	      && (depth == opt.reclevel || depth == opt.reclevel + 1))
	    {
	      /* When -p is specified, we are allowed to exceed the
		 maximum depth, but only for the "inline" links,
		 i.e. those that are needed to display the page.
		 Originally this could exceed the depth at most by
		 one, but we allow one more level so that the leaf
		 pages that contain frames can be loaded
		 correctly.  */
	      dash_p_leaf_HTML = TRUE;
	    }
	  else
	    {
	      /* Either -p wasn't specified or it was and we've
		 already spent the two extra (pseudo-)levels that it
		 affords us, so we need to bail out. */
	      DEBUGP (("Not descending further; at depth %d, max. %d.\n",
		       depth, opt.reclevel));
	      descend = 0;
	    }
	}

      /* If the downloaded document was HTML, parse it and enqueue the
	 links it contains. */

      if (descend)
	{
	  int meta_disallow_follow = 0;
	  struct urlpos *children
	    = get_urls_html (file, url, &meta_disallow_follow);

	  if (opt.use_robots && meta_disallow_follow)
	    {
	      free_urlpos (children);
	      children = NULL;
	    }

	  if (children)
	    {
	      struct urlpos *child = children;
	      struct url *url_parsed = url_parsed = url_parse (url, NULL);
	      assert (url_parsed != NULL);

	      for (; child; child = child->next)
		{
		  if (child->ignore_when_downloading)
		    continue;
		  if (dash_p_leaf_HTML && !child->link_inline_p)
		    continue;
		  if (download_child_p (child, url_parsed, depth, start_url_parsed,
					blacklist))
		    {
		      url_enqueue (queue, xstrdup (child->url->url),
				   xstrdup (url), depth + 1,
				   child->link_expect_html);
		      /* We blacklist the URL we have enqueued, because we
			 don't want to enqueue (and hence download) the
			 same URL twice.  */
		      string_set_add (blacklist, child->url->url);
		    }
		}

	      url_free (url_parsed);
	      free_urlpos (children);
	    }
	}

      if (opt.delete_after || (file && !acceptable (file)))
	{
	  /* Either --delete-after was specified, or we loaded this
	     otherwise rejected (e.g. by -R) HTML file just so we
	     could harvest its hyperlinks -- in either case, delete
	     the local file. */
	  DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		   opt.delete_after ? "--delete-after" :
		   "recursive rejection criteria"));
	  logprintf (LOG_VERBOSE,
		     (opt.delete_after
		      ? _("Removing %s.\n")
		      : _("Removing %s since it should be rejected.\n")),
		     file);
	  if (unlink (file))
	    logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	  register_delete_file (file);
	}

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3, d4;
    while (url_dequeue (queue,
			(const char **)&d1, (const char **)&d2, &d3, &d4))
      {
	xfree (d1);
	FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
Exemple #20
0
int list_remote_function_packages (char **getbuf, int filter)
{
    return retrieve_url(gretlhost, LIST_FUNCS, NULL, NULL, 
			NULL, filter, getbuf);
}
Exemple #21
0
int retrieve_remote_datafiles_package (const char *pkgname, 
				       const char *localname)
{
    return retrieve_url(sffiles, GRAB_PKG, pkgname, NULL, 
			localname, 0, NULL);
}
Exemple #22
0
int retrieve_remote_db_index (const char *dbname, char **getbuf) 
{
    return retrieve_url(dbhost, GRAB_IDX, dbname, NULL, 
			NULL, 0, getbuf);
}
Exemple #23
0
/* Loop through all files in metalink structure and retrieve them.
   Returns RETROK if all files were downloaded.
   Returns last retrieval error (from retrieve_url) if some files
   could not be downloaded.  */
uerr_t
retrieve_from_metalink (const metalink_t* metalink)
{
  metalink_file_t **mfile_ptr;
  uerr_t last_retr_err = RETROK; /* Store last encountered retrieve error.  */

  FILE *_output_stream = output_stream;
  bool _output_stream_regular = output_stream_regular;
  char *_output_document = opt.output_document;

  DEBUGP (("Retrieving from Metalink\n"));

  /* No files to download.  */
  if (!metalink->files)
    return RETROK;

  if (opt.output_document)
    {
      /* We cannot support output_document as we need to compute checksum
         of downloaded file, and to remove it if the checksum is bad.  */
      logputs (LOG_NOTQUIET,
               _("-O not supported for metalink download. Ignoring.\n"));
    }

  for (mfile_ptr = metalink->files; *mfile_ptr; mfile_ptr++)
    {
      metalink_file_t *mfile = *mfile_ptr;
      metalink_resource_t **mres_ptr;
      char *filename = NULL;
      bool hash_ok = false;

      uerr_t retr_err = METALINK_MISSING_RESOURCE;

      /* -1 -> file should be rejected
         0 -> could not verify
         1 -> verified successfully  */
      char sig_status = 0;

      output_stream = NULL;

      DEBUGP (("Processing metalink file %s...\n", quote (mfile->name)));

      /* Resources are sorted by priority.  */
      for (mres_ptr = mfile->resources; *mres_ptr; mres_ptr++)
        {
          metalink_resource_t *mres = *mres_ptr;
          metalink_checksum_t **mchksum_ptr, *mchksum;
          struct iri *iri;
          struct url *url;
          int url_err;

          if (!RES_TYPE_SUPPORTED (mres->type))
            {
              logprintf (LOG_VERBOSE,
                         _("Resource type %s not supported, ignoring...\n"),
                         quote (mres->type));
              continue;
            }

          retr_err = METALINK_RETR_ERROR;

          /* If output_stream is not NULL, then we have failed on
             previous resource and are retrying. Thus, remove the file.  */
          if (output_stream)
            {
              fclose (output_stream);
              output_stream = NULL;
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
              xfree (filename);
            }

          /* Parse our resource URL.  */
          iri = iri_new ();
          set_uri_encoding (iri, opt.locale, true);
          url = url_parse (mres->url, &url_err, iri, false);

          if (!url)
            {
              char *error = url_error (mres->url, url_err);
              logprintf (LOG_NOTQUIET, "%s: %s.\n", mres->url, error);
              xfree (error);
              inform_exit_status (URLERROR);
              iri_free (iri);
              continue;
            }
          else
            {
              /* Avoid recursive Metalink from HTTP headers.  */
              bool _metalink_http = opt.metalink_over_http;

              /* Assure proper local file name regardless of the URL
                 of particular Metalink resource.
                 To do that we create the local file here and put
                 it as output_stream. We restore the original configuration
                 after we are finished with the file.  */
              output_stream = unique_create (mfile->name, true, &filename);
              output_stream_regular = true;

              /* Store the real file name for displaying in messages.  */
              opt.output_document = filename;

              opt.metalink_over_http = false;
              DEBUGP (("Storing to %s\n", filename));
              retr_err = retrieve_url (url, mres->url, NULL, NULL,
                                       NULL, NULL, opt.recursive, iri, false);
              opt.metalink_over_http = _metalink_http;
            }
          url_free (url);
          iri_free (iri);

          if (retr_err == RETROK)
            {
              FILE *local_file;

              /* Check the digest.  */
              local_file = fopen (filename, "rb");
              if (!local_file)
                {
                  logprintf (LOG_NOTQUIET, _("Could not open downloaded file.\n"));
                  continue;
                }

              for (mchksum_ptr = mfile->checksums; *mchksum_ptr; mchksum_ptr++)
                {
                  char sha256[SHA256_DIGEST_SIZE];
                  char sha256_txt[2 * SHA256_DIGEST_SIZE + 1];

                  mchksum = *mchksum_ptr;

                  /* I have seen both variants...  */
                  if (strcasecmp (mchksum->type, "sha256")
                      && strcasecmp (mchksum->type, "sha-256"))
                    {
                      DEBUGP (("Ignoring unsupported checksum type %s.\n",
                               quote (mchksum->type)));
                      continue;
                    }

                  logprintf (LOG_VERBOSE, _("Computing checksum for %s\n"),
                             quote (mfile->name));

                  sha256_stream (local_file, sha256);
                  wg_hex_to_string (sha256_txt, sha256, SHA256_DIGEST_SIZE);
                  DEBUGP (("Declared hash: %s\n", mchksum->hash));
                  DEBUGP (("Computed hash: %s\n", sha256_txt));
                  if (!strcmp (sha256_txt, mchksum->hash))
                    {
                      logputs (LOG_VERBOSE,
                               _("Checksum matches.\n"));
                      hash_ok = true;
                    }
                  else
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("Checksum mismatch for file %s.\n"),
                                 quote (mfile->name));
                      hash_ok = false;
                    }

                  /* Stop as soon as we checked the supported checksum.  */
                  break;
                } /* Iterate over available checksums.  */
              fclose (local_file);
              local_file = NULL;

              if (!hash_ok)
                continue;

              sig_status = 0; /* Not verified.  */

#ifdef HAVE_GPGME
              /* Check the crypto signature.

                 Note that the signtures from Metalink in XML will not be
                 parsed when using libmetalink version older than 0.1.3.
                 Metalink-over-HTTP is not affected by this problem.  */
              if (mfile->signature)
                {
                  metalink_signature_t *msig = mfile->signature;
                  gpgme_error_t gpgerr;
                  gpgme_ctx_t gpgctx;
                  gpgme_data_t gpgsigdata, gpgdata;
                  gpgme_verify_result_t gpgres;
                  gpgme_signature_t gpgsig;
                  gpgme_protocol_t gpgprot = GPGME_PROTOCOL_UNKNOWN;
                  int fd = -1;

                  /* Initialize the library - as name suggests.  */
                  gpgme_check_version (NULL);

                  /* Open data file.  */
                  fd = open (filename, O_RDONLY);
                  if (fd == -1)
                    {
                      logputs (LOG_NOTQUIET,
                               _("Could not open downloaded file for signature "
                                 "verification.\n"));
                      goto gpg_skip_verification;
                    }

                  /* Assign file descriptor to GPG data structure.  */
                  gpgerr = gpgme_data_new_from_fd (&gpgdata, fd);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME data_new_from_fd: %s\n",
                                 gpgme_strerror (gpgerr));
                      goto gpg_skip_verification;
                    }

                  /* Prepare new GPGME context.  */
                  gpgerr = gpgme_new (&gpgctx);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME new: %s\n",
                                 gpgme_strerror (gpgerr));
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  DEBUGP (("Verifying signature %s:\n%s\n",
                           quote (msig->mediatype),
                           msig->signature));

                  /* Check signature type.  */
                  if (!strcmp (msig->mediatype, "application/pgp-signature"))
                    gpgprot = GPGME_PROTOCOL_OpenPGP;
                  else /* Unsupported signature type.  */
                    {
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  gpgerr = gpgme_set_protocol (gpgctx, gpgprot);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME set_protocol: %s\n",
                                 gpgme_strerror (gpgerr));
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Load the signature.  */
                  gpgerr = gpgme_data_new_from_mem (&gpgsigdata,
                                                    msig->signature,
                                                    strlen (msig->signature),
                                                    0);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("GPGME data_new_from_mem: %s\n"),
                                 gpgme_strerror (gpgerr));
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Verify the signature.  */
                  gpgerr = gpgme_op_verify (gpgctx, gpgsigdata, gpgdata, NULL);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("GPGME op_verify: %s\n"),
                                 gpgme_strerror (gpgerr));
                      gpgme_data_release (gpgsigdata);
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Check the results.  */
                  gpgres = gpgme_op_verify_result (gpgctx);
                  if (!gpgres)
                    {
                      logputs (LOG_NOTQUIET,
                               _("GPGME op_verify_result: NULL\n"));
                      gpgme_data_release (gpgsigdata);
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* The list is null-terminated.  */
                  for (gpgsig = gpgres->signatures; gpgsig; gpgsig = gpgsig->next)
                    {
                      DEBUGP (("Checking signature %s\n", gpgsig->fpr));

                      if (gpgsig->summary
                          & (GPGME_SIGSUM_VALID | GPGME_SIGSUM_GREEN))
                        {
                          logputs (LOG_VERBOSE,
                                   _("Signature validation suceeded.\n"));
                          sig_status = 1;
                          break;
                        }

                      if (gpgsig->summary & GPGME_SIGSUM_RED)
                        {
                          logputs (LOG_NOTQUIET,
                                   _("Invalid signature. Rejecting resource.\n"));
                          sig_status = -1;
                          break;
                        }

                      if (gpgsig->summary == 0
                          && (gpgsig->status & 0xFFFF) == GPG_ERR_NO_ERROR)
                        {
                          logputs (LOG_VERBOSE,
                                   _("Data matches signature, but signature "
                                     "is not trusted.\n"));
                        }

                      if ((gpgsig->status & 0xFFFF) != GPG_ERR_NO_ERROR)
                        {
                          logprintf (LOG_NOTQUIET,
                                     "GPGME: %s\n",
                                     gpgme_strerror (gpgsig->status & 0xFFFF));
                        }
                    }
                  gpgme_data_release (gpgsigdata);
                  gpgme_release (gpgctx);
                  gpgme_data_release (gpgdata);
gpg_skip_verification:
                  if (fd != -1)
                    close (fd);
                } /* endif (mfile->signature) */
#endif
              /* Stop if file was downloaded with success.  */
              if (sig_status >= 0)
                break;
            } /* endif RETR_OK.  */
        } /* Iterate over resources.  */

      if (retr_err != RETROK)
        {
          logprintf (LOG_VERBOSE, _("Failed to download %s. Skipping resource.\n"),
                     quote (mfile->name));
        }
      else if (!hash_ok)
        {
          retr_err = METALINK_CHKSUM_ERROR;
          logprintf (LOG_NOTQUIET,
                     _("File %s retrieved but checksum does not match. "
                       "\n"), quote (mfile->name));
        }
#ifdef HAVE_GPGME
        /* Signature will be only validated if hash check was successful.  */
      else if (sig_status < 0)
        {
          retr_err = METALINK_SIG_ERROR;
          logprintf (LOG_NOTQUIET,
                     _("File %s retrieved but signature does not match. "
                       "\n"), quote (mfile->name));
        }
#endif
      last_retr_err = retr_err == RETROK ? last_retr_err : retr_err;

      /* Remove the file if error encountered or if option specified.
         Note: the file has been downloaded using *_loop. Therefore, it
         is not necessary to keep the file for continuated download.  */
      if ((retr_err != RETROK || opt.delete_after)
           && filename != NULL && file_exists_p (filename))
        {
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), quote (filename));
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }
      if (output_stream)
        {
          fclose (output_stream);
          output_stream = NULL;
        }
      xfree (filename);
    } /* Iterate over files.  */

  /* Restore original values.  */
  opt.output_document = _output_document;
  output_stream_regular = _output_stream_regular;
  output_stream = _output_stream;

  return last_retr_err;
}
Exemple #24
0
int retrieve_manfile (const char *fname, const char *localname)
{
    return retrieve_url(sffiles, GRAB_PDF, fname, NULL, 
			localname, 0, NULL);
}
Exemple #25
0
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{
  uerr_t status;
  struct urlpos *url_list, *cur_url;
  struct iri *iri = iri_new();

  char *input_file, *url_file = NULL;
  const char *url = file;

  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */

  /* sXXXav : Assume filename and links in the file are in the locale */
  set_uri_encoding (iri, opt.locale, true);
  set_content_encoding (iri, opt.locale);

  if (url_valid_scheme (url))
    {
      int dt,url_err;
      uerr_t status;
      struct url *url_parsed = url_parse (url, &url_err, iri, true);
      if (!url_parsed)
        {
          char *error = url_error (url, url_err);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
          xfree (error);
          return URLERROR;
        }

      if (!opt.base_href)
        opt.base_href = xstrdup (url);

      status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
                             false, iri, true);
      url_free (url_parsed);

      if (!url_file || (status != RETROK))
        return status;

      if (dt & TEXTHTML)
        html = true;

      /* If we have a found a content encoding, use it.
       * ( == is okay, because we're checking for identical object) */
      if (iri->content_encoding != opt.locale)
	  set_uri_encoding (iri, iri->content_encoding, false);

      /* Reset UTF-8 encode status */
      iri->utf8_encode = opt.enable_iri;
      xfree_null (iri->orig_url);
      iri->orig_url = NULL;

      input_file = url_file;
    }
  else
    input_file = (char *) file;

  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
              : get_urls_file (input_file));

  xfree_null (url_file);

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename = NULL, *new_file = NULL;
      int dt;
      struct iri *tmpiri = iri_dup (iri);
      struct url *parsed_url = NULL;

      if (cur_url->ignore_when_downloading)
        continue;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        {
          status = QUOTEXC;
          break;
        }

      parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);

      if ((opt.recursive || opt.page_requisites)
          && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
        {
          int old_follow_ftp = opt.follow_ftp;

          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
          if (cur_url->url->scheme == SCHEME_FTP)
            opt.follow_ftp = 1;

          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
                                  tmpiri);

          opt.follow_ftp = old_follow_ftp;
        }
      else
        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
                               cur_url->url->url, &filename,
                               &new_file, NULL, &dt, opt.recursive, tmpiri,
                               true);

      if (parsed_url)
          url_free (parsed_url);

      if (filename && opt.delete_after && file_exists_p (filename))
        {
          DEBUGP (("\
Removing file due to --delete-after in retrieve_from_file():\n"));
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          dt &= ~RETROKF;
        }

      xfree_null (new_file);
      xfree_null (filename);
      iri_free (tmpiri);
    }
Exemple #26
0
int list_remote_dbs (char **getbuf)
{
    return retrieve_url(dbhost, LIST_DBS, NULL, NULL, 
			NULL, 0, getbuf);
}