bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url *url_parsed;

  /* Copy the server URI encoding for a possible IDNA transformation; there
     is no need to encode the full URI in UTF-8, because "robots.txt" is
     plain ASCII.  */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider = false;

  url_parsed = url_parse (robots_url, &url_err, i, true);
  if (!url_parsed)
    {
      char *error = url_error (robots_url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
      xfree (error);
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free (url_parsed);
    }

  opt.timestamping = saved_ts_val;
  opt.spider = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate it here so that the
         caller doesn't have to worry about it.  */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}
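/* A minimal usage sketch for res_retrieve_file (), assuming the
   res_parse_from_file () parser declared in res.h.  The recursive
   download code follows roughly this shape when it first meets a new
   host: fetch robots.txt, parse it into in-memory specs, and drop the
   temporary file.  Names here are illustrative, not the real caller.  */
static struct robot_specs *
fetch_robot_specs_sketch (const char *url_on_host, struct iri *iri)
{
  char *rfile = NULL;
  struct robot_specs *specs = NULL;

  if (res_retrieve_file (url_on_host, &rfile, iri))
    {
      /* Parse the downloaded robots.txt into in-memory specs.  */
      specs = res_parse_from_file (rfile);
      xfree (rfile);
    }

  /* NULL means "no usable robots.txt"; callers typically treat that
     as "everything allowed".  */
  return specs;
}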
static void
convert_links_in_hashtable (struct hash_table *downloaded_set,
                            int is_css,
                            int *file_count)
{
  int i;
  int cnt;
  char **file_array;

  cnt = 0;
  if (downloaded_set)
    cnt = hash_table_count (downloaded_set);
  if (cnt == 0)
    return;
  file_array = alloca_array (char *, cnt);
  string_set_to_array (downloaded_set, file_array);

  for (i = 0; i < cnt; i++)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = file_array[i];

      /* Determine the URL of the file.  get_urls_{html,css} will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
        {
          DEBUGP (("Apparently %s has been removed.\n", file));
          continue;
        }

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the file...  */
      urls = is_css ? get_urls_css_file (file, url)
                    : get_urls_html (file, url, NULL, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u;
          struct iri *pi;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */

          pi = iri_new ();
          set_uri_encoding (pi, opt.locale, true);

          u = url_parse (cur_url->url->url, NULL, pi, true);
          if (!u)
            {
              /* Don't leak the iri when the URL fails to parse.  */
              iri_free (pi);
              continue;
            }

          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n",
                       u->url, local_name));
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));
            }

          url_free (u);
          iri_free (pi);
        }

      /* Convert the links in the file.  */
      convert_links (file, urls);
      ++*file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }
}
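/* A minimal driver sketch in the spirit of convert_all_links (),
   assuming the downloaded_html_set and downloaded_css_set string sets
   that the download code maintains for files saved to disk.  HTML and
   CSS documents go through the same routine above; only the is_css
   flag selects the parser.  */
static void
convert_all_links_sketch (void)
{
  int file_count = 0;

  /* Convert links in HTML documents first, then in CSS stylesheets.  */
  convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
  convert_links_in_hashtable (downloaded_css_set, 1, &file_count);

  logprintf (LOG_VERBOSE, _("Converted links in %d files.\n"), file_count);
}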
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{
  uerr_t status;
  struct urlpos *url_list, *cur_url;
  struct iri *iri = iri_new ();

  char *input_file, *url_file = NULL;
  const char *url = file;

  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */

  /* sXXXav : Assume that the file name and the links in the file are
     in the locale.  */
  set_uri_encoding (iri, opt.locale, true);
  set_content_encoding (iri, opt.locale);

  if (url_valid_scheme (url))
    {
      int dt, url_err;
      uerr_t status;
      struct url *url_parsed = url_parse (url, &url_err, iri, true);

      if (!url_parsed)
        {
          char *error = url_error (url, url_err);
          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
          xfree (error);
          return URLERROR;
        }

      if (!opt.base_href)
        opt.base_href = xstrdup (url);

      status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
                             false, iri, true);
      url_free (url_parsed);

      if (!url_file || (status != RETROK))
        return status;

      if (dt & TEXTHTML)
        html = true;

      /* If we have found a content encoding, use it.
         (`==' is okay here, because we're checking for the identical
         object.)  */
      if (iri->content_encoding != opt.locale)
        set_uri_encoding (iri, iri->content_encoding, false);

      /* Reset the UTF-8 encode status.  */
      iri->utf8_encode = opt.enable_iri;
      xfree_null (iri->orig_url);
      iri->orig_url = NULL;

      input_file = url_file;
    }
  else
    input_file = (char *) file;

  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
                   : get_urls_file (input_file));

  xfree_null (url_file);

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename = NULL, *new_file = NULL;
      int dt;
      struct iri *tmpiri = iri_dup (iri);
      struct url *parsed_url = NULL;

      if (cur_url->ignore_when_downloading)
        continue;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        {
          status = QUOTEXC;
          break;
        }

      parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);

      if ((opt.recursive || opt.page_requisites)
          && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
        {
          int old_follow_ftp = opt.follow_ftp;

          /* Turn opt.follow_ftp on in case of recursive FTP retrieval.  */
          if (cur_url->url->scheme == SCHEME_FTP)
            opt.follow_ftp = 1;

          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
                                  tmpiri);

          opt.follow_ftp = old_follow_ftp;
        }
      else
        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
                               cur_url->url->url, &filename, &new_file,
                               NULL, &dt, opt.recursive, tmpiri, true);

      if (parsed_url)
        url_free (parsed_url);

      if (filename && opt.delete_after && file_exists_p (filename))
        {
          DEBUGP (("\
Removing file due to --delete-after in retrieve_from_file():\n"));
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          dt &= ~RETROKF;
        }

      xfree_null (new_file);
      xfree_null (filename);
      iri_free (tmpiri);
    }

  /* Free the linked list of URLs and the iri before returning.  */
  free_urlpos (url_list);
  iri_free (iri);

  return status;
}
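/* A usage sketch for retrieve_from_file (), roughly what `-i FILE'
   (with `--force-html' mapping to html == true) boils down to.  The
   names here are illustrative, not the real main.c driver.  */
static uerr_t
download_from_input_sketch (const char *input, bool force_html)
{
  int count = 0;
  uerr_t status = retrieve_from_file (input, force_html, &count);

  if (status == QUOTEXC)
    logputs (LOG_NOTQUIET, _("Download quota exceeded.\n"));

  logprintf (LOG_VERBOSE, _("Processed %d URLs from %s.\n"), count, input);
  return status;
}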
/* Loop through all files in the metalink structure and retrieve them.
   Returns RETROK if all files were downloaded.
   Returns the last retrieval error (from retrieve_url) if some files
   could not be downloaded.  */
uerr_t
retrieve_from_metalink (const metalink_t *metalink)
{
  metalink_file_t **mfile_ptr;
  uerr_t last_retr_err = RETROK; /* Store last encountered retrieve error.  */

  FILE *_output_stream = output_stream;
  bool _output_stream_regular = output_stream_regular;
  char *_output_document = opt.output_document;

  DEBUGP (("Retrieving from Metalink\n"));

  /* No files to download.  */
  if (!metalink->files)
    return RETROK;

  if (opt.output_document)
    {
      /* We cannot support output_document, as we need to compute the
         checksum of the downloaded file, and to remove it if the
         checksum is bad.  */
      logputs (LOG_NOTQUIET,
               _("-O not supported for metalink download. Ignoring.\n"));
    }

  for (mfile_ptr = metalink->files; *mfile_ptr; mfile_ptr++)
    {
      metalink_file_t *mfile = *mfile_ptr;
      metalink_resource_t **mres_ptr;
      char *filename = NULL;
      bool hash_ok = false;

      uerr_t retr_err = METALINK_MISSING_RESOURCE;

      /* -1 -> file should be rejected
          0 -> could not verify
          1 -> verified successfully  */
      char sig_status = 0;

      output_stream = NULL;

      DEBUGP (("Processing metalink file %s...\n", quote (mfile->name)));

      /* Resources are sorted by priority.  */
      for (mres_ptr = mfile->resources; *mres_ptr; mres_ptr++)
        {
          metalink_resource_t *mres = *mres_ptr;
          metalink_checksum_t **mchksum_ptr, *mchksum;
          struct iri *iri;
          struct url *url;
          int url_err;

          if (!RES_TYPE_SUPPORTED (mres->type))
            {
              logprintf (LOG_VERBOSE,
                         _("Resource type %s not supported, ignoring...\n"),
                         quote (mres->type));
              continue;
            }

          retr_err = METALINK_RETR_ERROR;

          /* If output_stream is not NULL, then we have failed on a
             previous resource and are retrying.  Thus, remove the file.  */
          if (output_stream)
            {
              fclose (output_stream);
              output_stream = NULL;
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
              xfree (filename);
            }

          /* Parse our resource URL.  */
          iri = iri_new ();
          set_uri_encoding (iri, opt.locale, true);
          url = url_parse (mres->url, &url_err, iri, false);

          if (!url)
            {
              char *error = url_error (mres->url, url_err);
              logprintf (LOG_NOTQUIET, "%s: %s.\n", mres->url, error);
              xfree (error);
              inform_exit_status (URLERROR);
              iri_free (iri);
              continue;
            }
          else
            {
              /* Avoid recursive Metalink from HTTP headers.  */
              bool _metalink_http = opt.metalink_over_http;

              /* Assure a proper local file name regardless of the URL
                 of the particular Metalink resource.  To do that, we
                 create the local file here and put it as output_stream.
                 We restore the original configuration after we are
                 finished with the file.  */
              output_stream = unique_create (mfile->name, true, &filename);
              output_stream_regular = true;

              /* Store the real file name for displaying in messages.  */
              opt.output_document = filename;

              opt.metalink_over_http = false;
              DEBUGP (("Storing to %s\n", filename));
              retr_err = retrieve_url (url, mres->url, NULL, NULL, NULL,
                                       NULL, opt.recursive, iri, false);
              opt.metalink_over_http = _metalink_http;
            }

          url_free (url);
          iri_free (iri);

          if (retr_err == RETROK)
            {
              FILE *local_file;

              /* Check the digest.  */
              local_file = fopen (filename, "rb");
              if (!local_file)
                {
                  logprintf (LOG_NOTQUIET,
                             _("Could not open downloaded file.\n"));
                  continue;
                }

              for (mchksum_ptr = mfile->checksums; *mchksum_ptr; mchksum_ptr++)
                {
                  char sha256[SHA256_DIGEST_SIZE];
                  char sha256_txt[2 * SHA256_DIGEST_SIZE + 1];

                  mchksum = *mchksum_ptr;

                  /* I have seen both variants...  */
                  if (strcasecmp (mchksum->type, "sha256")
                      && strcasecmp (mchksum->type, "sha-256"))
                    {
                      DEBUGP (("Ignoring unsupported checksum type %s.\n",
                               quote (mchksum->type)));
                      continue;
                    }

                  logprintf (LOG_VERBOSE, _("Computing checksum for %s\n"),
                             quote (mfile->name));

                  sha256_stream (local_file, sha256);
                  wg_hex_to_string (sha256_txt, sha256, SHA256_DIGEST_SIZE);

                  DEBUGP (("Declared hash: %s\n", mchksum->hash));
                  DEBUGP (("Computed hash: %s\n", sha256_txt));

                  if (!strcmp (sha256_txt, mchksum->hash))
                    {
                      logputs (LOG_VERBOSE, _("Checksum matches.\n"));
                      hash_ok = true;
                    }
                  else
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("Checksum mismatch for file %s.\n"),
                                 quote (mfile->name));
                      hash_ok = false;
                    }

                  /* Stop as soon as we have checked the supported
                     checksum.  */
                  break;
                } /* Iterate over available checksums.  */
              fclose (local_file);
              local_file = NULL;

              if (!hash_ok)
                continue;

              sig_status = 0; /* Not verified.  */

#ifdef HAVE_GPGME
              /* Check the crypto signature.

                 Note that the signatures from Metalink in XML will not
                 be parsed when using a libmetalink version older than
                 0.1.3.  Metalink-over-HTTP is not affected by this
                 problem.  */
              if (mfile->signature)
                {
                  metalink_signature_t *msig = mfile->signature;
                  gpgme_error_t gpgerr;
                  gpgme_ctx_t gpgctx;
                  gpgme_data_t gpgsigdata, gpgdata;
                  gpgme_verify_result_t gpgres;
                  gpgme_signature_t gpgsig;
                  gpgme_protocol_t gpgprot = GPGME_PROTOCOL_UNKNOWN;
                  int fd = -1;

                  /* Initialize the library - as the name suggests.  */
                  gpgme_check_version (NULL);

                  /* Open the data file.  */
                  fd = open (filename, O_RDONLY);
                  if (fd == -1)
                    {
                      logputs (LOG_NOTQUIET,
                               _("Could not open downloaded file for signature "
                                 "verification.\n"));
                      goto gpg_skip_verification;
                    }

                  /* Assign the file descriptor to the GPG data
                     structure.  */
                  gpgerr = gpgme_data_new_from_fd (&gpgdata, fd);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME data_new_from_fd: %s\n",
                                 gpgme_strerror (gpgerr));
                      goto gpg_skip_verification;
                    }

                  /* Prepare a new GPGME context.  */
                  gpgerr = gpgme_new (&gpgctx);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME new: %s\n",
                                 gpgme_strerror (gpgerr));
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  DEBUGP (("Verifying signature %s:\n%s\n",
                           quote (msig->mediatype),
                           msig->signature));

                  /* Check the signature type.  */
                  if (!strcmp (msig->mediatype, "application/pgp-signature"))
                    gpgprot = GPGME_PROTOCOL_OpenPGP;
                  else /* Unsupported signature type.  */
                    {
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  gpgerr = gpgme_set_protocol (gpgctx, gpgprot);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 "GPGME set_protocol: %s\n",
                                 gpgme_strerror (gpgerr));
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Load the signature.  */
                  gpgerr = gpgme_data_new_from_mem (&gpgsigdata,
                                                    msig->signature,
                                                    strlen (msig->signature),
                                                    0);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("GPGME data_new_from_mem: %s\n"),
                                 gpgme_strerror (gpgerr));
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Verify the signature.  */
                  gpgerr = gpgme_op_verify (gpgctx, gpgsigdata, gpgdata, NULL);
                  if (gpgerr != GPG_ERR_NO_ERROR)
                    {
                      logprintf (LOG_NOTQUIET,
                                 _("GPGME op_verify: %s\n"),
                                 gpgme_strerror (gpgerr));
                      gpgme_data_release (gpgsigdata);
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* Check the results.  */
                  gpgres = gpgme_op_verify_result (gpgctx);
                  if (!gpgres)
                    {
                      logputs (LOG_NOTQUIET,
                               _("GPGME op_verify_result: NULL\n"));
                      gpgme_data_release (gpgsigdata);
                      gpgme_release (gpgctx);
                      gpgme_data_release (gpgdata);
                      goto gpg_skip_verification;
                    }

                  /* The list is null-terminated.  */
                  for (gpgsig = gpgres->signatures; gpgsig;
                       gpgsig = gpgsig->next)
                    {
                      DEBUGP (("Checking signature %s\n", gpgsig->fpr));

                      if (gpgsig->summary
                          & (GPGME_SIGSUM_VALID | GPGME_SIGSUM_GREEN))
                        {
                          logputs (LOG_VERBOSE,
                                   _("Signature validation succeeded.\n"));
                          sig_status = 1;
                          break;
                        }

                      if (gpgsig->summary & GPGME_SIGSUM_RED)
                        {
                          logputs (LOG_NOTQUIET,
                                   _("Invalid signature. Rejecting "
                                     "resource.\n"));
                          sig_status = -1;
                          break;
                        }

                      if (gpgsig->summary == 0
                          && (gpgsig->status & 0xFFFF) == GPG_ERR_NO_ERROR)
                        {
                          logputs (LOG_VERBOSE,
                                   _("Data matches signature, but signature "
                                     "is not trusted.\n"));
                        }

                      if ((gpgsig->status & 0xFFFF) != GPG_ERR_NO_ERROR)
                        {
                          logprintf (LOG_NOTQUIET,
                                     "GPGME: %s\n",
                                     gpgme_strerror (gpgsig->status & 0xFFFF));
                        }
                    }

                  gpgme_data_release (gpgsigdata);
                  gpgme_release (gpgctx);
                  gpgme_data_release (gpgdata);

gpg_skip_verification:
                  if (fd != -1)
                    close (fd);
                } /* endif (mfile->signature) */
#endif
              /* Stop if the file was downloaded successfully.  */
              if (sig_status >= 0)
                break;
            } /* endif RETR_OK.  */
        } /* Iterate over resources.  */

      if (retr_err != RETROK)
        {
          logprintf (LOG_VERBOSE,
                     _("Failed to download %s. Skipping resource.\n"),
                     quote (mfile->name));
        }
      else if (!hash_ok)
        {
          retr_err = METALINK_CHKSUM_ERROR;
          logprintf (LOG_NOTQUIET,
                     _("File %s retrieved but checksum does not match.\n"),
                     quote (mfile->name));
        }
#ifdef HAVE_GPGME
      /* The signature is validated only if the hash check was
         successful.  */
      else if (sig_status < 0)
        {
          retr_err = METALINK_SIG_ERROR;
          logprintf (LOG_NOTQUIET,
                     _("File %s retrieved but signature does not match.\n"),
                     quote (mfile->name));
        }
#endif
      last_retr_err = retr_err == RETROK ? last_retr_err : retr_err;

      /* Remove the file if an error was encountered or if the option
         so specifies.  Note: the file has been downloaded using
         *_loop.  Therefore, it is not necessary to keep the file for
         a continued download.  */
      if ((retr_err != RETROK || opt.delete_after)
          && filename != NULL && file_exists_p (filename))
        {
          logprintf (LOG_VERBOSE, _("Removing %s.\n"), quote (filename));
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }
      if (output_stream)
        {
          fclose (output_stream);
          output_stream = NULL;
        }
      xfree (filename);
    } /* Iterate over files.  */

  /* Restore the original values.  */
  opt.output_document = _output_document;
  output_stream_regular = _output_stream_regular;
  output_stream = _output_stream;

  return last_retr_err;
}
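/* A caller sketch for retrieve_from_metalink (), assuming libmetalink's
   metalink_parse_file () and metalink_delete (), and assuming a
   METALINK_PARSE_ERROR member of uerr_t; wget's own metalink module
   wires this up differently, so treat the names as illustrative.  */
static uerr_t
download_metalink_sketch (const char *path)
{
  metalink_t *metalink = NULL;
  uerr_t status;

  /* libmetalink returns 0 on success and fills *metalink.  */
  if (metalink_parse_file (path, &metalink) != 0 || !metalink)
    {
      logprintf (LOG_NOTQUIET, _("Could not parse metalink file %s.\n"),
                 path);
      return METALINK_PARSE_ERROR;
    }

  status = retrieve_from_metalink (metalink);
  metalink_delete (metalink);
  return status;
}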
struct urlpos *
append_url (const char *link_uri, int position, int size,
            struct map_context *ctx)
{
  int link_has_scheme = url_has_scheme (link_uri);
  struct urlpos *newel;
  const char *base = ctx->base ? ctx->base : ctx->parent_base;
  struct url *url;
  struct iri *iri = iri_new ();

  set_uri_encoding (iri, opt.locale, true);
  iri->utf8_encode = true;

  if (!base)
    {
      DEBUGP (("%s: no base, merge will use \"%s\".\n",
               ctx->document_file, link_uri));

      if (!link_has_scheme)
        {
          /* Base URL is unavailable, and the link does not have a
             location attached to it -- we have to give up.  Since
             this can only happen when using `--force-html -i', print
             a warning.  */
          logprintf (LOG_NOTQUIET,
                     _("%s: Cannot resolve incomplete link %s.\n"),
                     ctx->document_file, link_uri);
          iri_free (iri);       /* don't leak the iri on failure */
          return NULL;
        }

      url = url_parse (link_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
                   ctx->document_file, link_uri));
          iri_free (iri);
          return NULL;
        }
    }
  else
    {
      /* Merge BASE with LINK_URI, but also make sure the result is
         canonicalized, i.e. that "../" have been resolved.
         (parse_url will do that for us.)  */
      char *complete_uri = uri_merge (base, link_uri);

      DEBUGP (("%s: merge(%s, %s) -> %s\n",
               quotearg_n_style (0, escape_quoting_style, ctx->document_file),
               quote_n (1, base),
               quote_n (2, link_uri),
               quotearg_n_style (3, escape_quoting_style, complete_uri)));

      url = url_parse (complete_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
                   ctx->document_file, complete_uri));
          xfree (complete_uri);
          iri_free (iri);
          return NULL;
        }
      xfree (complete_uri);
    }

  iri_free (iri);

  DEBUGP (("appending %s to urlpos.\n", quote (url->url)));

  newel = xnew0 (struct urlpos);
  newel->url = url;
  newel->pos = position;
  newel->size = size;

  /* A URL is relative if the host is not named, and the name does not
     start with `/'.  */
  if (!link_has_scheme && *link_uri != '/')
    newel->link_relative_p = 1;
  else if (link_has_scheme)
    newel->link_complete_p = 1;

  /* Append the new URL, maintaining the order by position.  */
  if (ctx->head == NULL)
    ctx->head = newel;
  else
    {
      struct urlpos *it, *prev = NULL;

      it = ctx->head;
      while (it && position > it->pos)
        {
          prev = it;
          it = it->next;
        }

      newel->next = it;

      if (prev)
        prev->next = newel;
      else
        ctx->head = newel;
    }

  return newel;
}
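/* A sketch of how a tag handler might feed append_url () while
   scanning a document (hypothetical handler name; the real handlers
   in html-url.c pass attribute positions supplied by the HTML
   parser).  The POSITION/SIZE pair records where the link text sits
   in the file, so that convert_links () can later rewrite it in
   place.  */
static void
handle_href_sketch (const char *attr_value, int attr_pos,
                    struct map_context *ctx)
{
  struct urlpos *up =
    append_url (attr_value, attr_pos, (int) strlen (attr_value), ctx);

  if (up)
    DEBUGP (("queued %s for conversion at offset %d\n",
             attr_value, up->pos));
}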