/* Tell whether URL already names the robots specification file.

   RES_SPECS_LOCATION is merged onto URL with uri_merge, and the merged
   string is compared against URL itself using are_urls_equal.  The
   temporary merged string is released before returning.

   Returns true when the two compare equal, false otherwise.  */
bool
is_robots_txt_url (const char *url)
{
  char *candidate = uri_merge (url, RES_SPECS_LOCATION);
  bool same = are_urls_equal (url, candidate);

  xfree (candidate);
  return same;
}
bool res_retrieve_file (const char *url, char **file, struct iri *iri) { struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; int saved_sp_val = opt.spider, url_err; struct url * url_parsed; /* Copy server URI encoding for a possible IDNA transformation, no need to encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ set_uri_encoding (i, iri->uri_encoding, false); i->utf8_encode = false; logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; url_parsed = url_parse (robots_url, &url_err, i, true); if (!url_parsed) { char *error = url_error (robots_url, url_err); logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error); xfree (error); err = URLERROR; } else { err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL, false, i, false); url_free(url_parsed); } opt.timestamping = saved_ts_val; opt.spider = saved_sp_val; xfree (robots_url); iri_free (i); if (err != RETROK && *file != NULL) { /* If the file is not retrieved correctly, but retrieve_url allocated the file name, deallocate is here so that the caller doesn't have to worry about it. */ xfree (*file); *file = NULL; } return err == RETROK; }
/* Construct and return a "transparent proxy" URL reflecting changes made by
   --adjust-extension to the file component (i.e., "basename") of the
   original URL, but leaving the "dirname" of the URL (protocol://hostname...
   portion) untouched.

   Think: populating a squid cache via a recursive wget scrape, where
   changing URLs to work locally with "file://..." is NOT desirable.

   Example:

   if
                     p = "//foo.com/bar.cgi?xyz"
   and
      link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
   then

      convert_basename (p, link);

   will return
      "//foo.com/bar.cgi?xyz.css"

   Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/

   P points at the link text inside the document; when it starts with a
   single or double quote the quote pair is excluded (LINK->size is
   reduced by two and P is advanced by one).  LINK->size gives the
   length of that text and LINK->local_name the local file name.

   When LINK->local_name is NULL or contains no '/', the original URL
   text is returned unchanged.

   The returned string is heap-allocated (by xstrndup or uri_merge); the
   caller owns it.  */
static char *
convert_basename (const char *p, const struct urlpos *link)
{
  int len = link->size;
  char *url = NULL;
  char *org_basename = NULL, *local_basename = NULL;
  char *result = NULL;

  if (*p == '"' || *p == '\'')
    {
      len -= 2;
      p++;
    }

  url = xstrndup (p, len);

  org_basename = strrchr (url, '/');
  if (org_basename)
    org_basename++;
  else
    org_basename = url;

  /* Passing a null pointer to strrchr is undefined behaviour, so only
     search the local name when there is one.  */
  if (link->local_name)
    local_basename = strrchr (link->local_name, '/');
  if (local_basename)
    local_basename++;
  else
    /* No usable local basename: merging URL onto itself below leaves the
       original URL text as the result.  */
    local_basename = url;

  /*
   * If the basenames differ, graft the adjusted basename (local_basename)
   * onto the original URL.
   */
  if (strcmp (org_basename, local_basename) == 0)
    result = url;
  else
    {
      result = uri_merge (url, local_basename);
      xfree (url);
    }

  return result;
}
/* Fetch the robots specification that belongs to URL.

   The location to fetch is RES_SPECS_LOCATION merged onto URL.  On
   success *FILE receives the name handed back by retrieve_url and the
   function returns 1.  On failure *FILE is NULL (a name that
   retrieve_url may have allocated is released here) and the function
   returns 0.  */
int
res_retrieve_file (const char *url, char **file)
{
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  uerr_t status;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));

  *file = NULL;
  status = retrieve_url (robots_url, file, NULL, NULL, NULL);
  xfree (robots_url);

  if (status == RETROK)
    return 1;

  /* The retrieval failed; if retrieve_url nevertheless allocated a file
     name, drop it here so the caller never sees a stale pointer.  */
  if (*file != NULL)
    {
      xfree (*file);
      *file = NULL;
    }
  return 0;
}
static void tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) { struct urlpos *base_urlpos; int attrind; char *newbase = find_attr (tag, "href", &attrind); if (!newbase) return; base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx), ATTR_SIZE(tag,attrind), ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; base_urlpos->link_base_p = 1; if (ctx->base) xfree (ctx->base); if (ctx->parent_base) ctx->base = uri_merge (ctx->parent_base, newbase); else ctx->base = xstrdup (newbase); }
uerr_t retrieve_url (struct url * orig_parsed, const char *origurl, char **file, char **newloc, const char *refurl, int *dt, bool recursive, struct iri *iri, bool register_status) { uerr_t result; char *url; bool location_changed; bool iri_fallbacked = 0; int dummy; char *mynewloc, *proxy; struct url *u = orig_parsed, *proxy_url; int up_error_code; /* url parse error code */ char *local_file; int redirection_count = 0; bool post_data_suspended = false; char *saved_post_data = NULL; char *saved_post_file_name = NULL; /* If dt is NULL, use local storage. */ if (!dt) { dt = &dummy; dummy = 0; } url = xstrdup (origurl); if (newloc) *newloc = NULL; if (file) *file = NULL; if (!refurl) refurl = opt.referer; redirected: /* (also for IRI fallbacking) */ result = NOCONERROR; mynewloc = NULL; local_file = NULL; proxy_url = NULL; proxy = getproxy (u); if (proxy) { struct iri *pi = iri_new (); set_uri_encoding (pi, opt.locale, true); pi->utf8_encode = false; /* Parse the proxy URL. */ proxy_url = url_parse (proxy, &up_error_code, NULL, true); if (!proxy_url) { char *error = url_error (proxy, up_error_code); logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), proxy, error); xfree (url); xfree (error); RESTORE_POST_DATA; result = PROXERR; goto bail; } if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme) { logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy); url_free (proxy_url); xfree (url); RESTORE_POST_DATA; result = PROXERR; goto bail; } } if (u->scheme == SCHEME_HTTP #ifdef HAVE_SSL || u->scheme == SCHEME_HTTPS #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { /* If this is a redirection, temporarily turn off opt.ftp_glob and opt.recursive, both being undesirable when following redirects. 
*/ bool oldrec = recursive, glob = opt.ftp_glob; if (redirection_count) oldrec = glob = false; result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob); recursive = oldrec; /* There is a possibility of having HTTP being redirected to FTP. In these cases we must decide whether the text is HTML according to the suffix. The HTML suffixes are `.html', `.htm' and a few others, case-insensitive. */ if (redirection_count && local_file && u->scheme == SCHEME_FTP) { if (has_html_suffix_p (local_file)) *dt |= TEXTHTML; } } if (proxy_url) { url_free (proxy_url); proxy_url = NULL; } location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST); if (location_changed) { char *construced_newloc; struct url *newloc_parsed; assert (mynewloc != NULL); if (local_file) xfree (local_file); /* The HTTP specs only allow absolute URLs to appear in redirects, but a ton of boneheaded webservers and CGIs out there break the rules and use relative URLs, and popular browsers are lenient about this, so wget should be too. */ construced_newloc = uri_merge (url, mynewloc); xfree (mynewloc); mynewloc = construced_newloc; /* Reset UTF-8 encoding state, keep the URI encoding and reset the content encoding. */ iri->utf8_encode = opt.enable_iri; set_content_encoding (iri, NULL); xfree_null (iri->orig_url); iri->orig_url = NULL; /* Now, see if this new location makes sense. */ newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true); if (!newloc_parsed) { char *error = url_error (mynewloc, up_error_code); logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), error); if (orig_parsed != u) { url_free (u); } xfree (url); xfree (mynewloc); xfree (error); RESTORE_POST_DATA; goto bail; } /* Now mynewloc will become newloc_parsed->url, because if the Location contained relative paths like .././something, we don't want that propagating as url. */ xfree (mynewloc); mynewloc = xstrdup (newloc_parsed->url); /* Check for max. number of redirections. 
*/ if (++redirection_count > opt.max_redirect) { logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"), opt.max_redirect); url_free (newloc_parsed); if (orig_parsed != u) { url_free (u); } xfree (url); xfree (mynewloc); RESTORE_POST_DATA; result = WRONGCODE; goto bail; } xfree (url); url = mynewloc; if (orig_parsed != u) { url_free (u); } u = newloc_parsed; /* If we're being redirected from POST, and we received a redirect code different than 307, we don't want to POST again. Many requests answer POST with a redirection to an index page; that redirection is clearly a GET. We "suspend" POST data for the duration of the redirections, and restore it when we're done. RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect specifically to preserve the method of the request. */ if (result != NEWLOCATION_KEEP_POST && !post_data_suspended) SUSPEND_POST_DATA; goto redirected; } /* Try to not encode in UTF-8 if fetching failed */ if (!(*dt & RETROKF) && iri->utf8_encode) { iri->utf8_encode = false; if (orig_parsed != u) { url_free (u); } u = url_parse (origurl, NULL, iri, true); if (u) { DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url))); url = xstrdup (u->url); iri_fallbacked = 1; goto redirected; } else DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url))); } if (local_file && u && *dt & RETROKF) { register_download (u->url, local_file); if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url)) register_redirection (origurl, u->url); if (*dt & TEXTHTML) register_html (local_file); if (*dt & TEXTCSS) register_css (local_file); } if (file) *file = local_file ? local_file : NULL; else xfree_null (local_file); if (orig_parsed != u) { url_free (u); } if (redirection_count || iri_fallbacked) { if (newloc) *newloc = url; else xfree (url); } else { if (newloc) *newloc = NULL; xfree (url); } RESTORE_POST_DATA; bail: if (register_status) inform_exit_status (result); return result; }
struct urlpos * get_urls_file (const char *file) { struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; /* Load the file. */ fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; text_end = fm->content + fm->length; while (text < text_end) { int up_error_code; char *url_text; struct urlpos *entry; struct url *url; const char *line_beg = text; const char *line_end = memchr (text, '\n', text_end - text); if (!line_end) line_end = text_end; else ++line_end; text = line_end; /* Strip whitespace from the beginning and end of line. */ while (line_beg < line_end && c_isspace (*line_beg)) ++line_beg; while (line_end > line_beg && c_isspace (*(line_end - 1))) --line_end; if (line_beg == line_end) continue; /* The URL is in the [line_beg, line_end) region. */ /* We must copy the URL to a zero-terminated string, and we can't use alloca because we're in a loop. *sigh*. */ url_text = strdupdelim (line_beg, line_end); if (opt.base_href) { /* Merge opt.base_href with URL. */ char *merged = uri_merge (opt.base_href, url_text); xfree (url_text); url_text = merged; } char *new_url = rewrite_shorthand_url (url_text); if (new_url) { xfree (url_text); url_text = new_url; } url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, error); xfree (url_text); xfree (error); inform_exit_status (URLERROR); continue; } xfree (url_text); entry = xnew0 (struct urlpos); entry->url = url; if (!head) head = entry; else tail->next = entry; tail = entry; } wget_read_file_free (fm); return head; }
struct urlpos * append_url (const char *link_uri, int position, int size, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; const char *base = ctx->base ? ctx->base : ctx->parent_base; struct url *url; struct iri *iri = iri_new (); set_uri_encoding (iri, opt.locale, true); iri->utf8_encode = true; if (!base) { DEBUGP (("%s: no base, merge will use \"%s\".\n", ctx->document_file, link_uri)); if (!link_has_scheme) { /* Base URL is unavailable, and the link does not have a location attached to it -- we have to give up. Since this can only happen when using `--force-html -i', print a warning. */ logprintf (LOG_NOTQUIET, _("%s: Cannot resolve incomplete link %s.\n"), ctx->document_file, link_uri); return NULL; } url = url_parse (link_uri, NULL, iri, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", ctx->document_file, link_uri)); return NULL; } } else { /* Merge BASE with LINK_URI, but also make sure the result is canonicalized, i.e. that "../" have been resolved. (parse_url will do that for us.) */ char *complete_uri = uri_merge (base, link_uri); DEBUGP (("%s: merge(%s, %s) -> %s\n", quotearg_n_style (0, escape_quoting_style, ctx->document_file), quote_n (1, base), quote_n (2, link_uri), quotearg_n_style (3, escape_quoting_style, complete_uri))); url = url_parse (complete_uri, NULL, iri, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", ctx->document_file, complete_uri)); xfree (complete_uri); return NULL; } xfree (complete_uri); } iri_free (iri); DEBUGP (("appending %s to urlpos.\n", quote (url->url))); newel = xnew0 (struct urlpos); newel->url = url; newel->pos = position; newel->size = size; /* A URL is relative if the host is not named, and the name does not start with `/'. */ if (!link_has_scheme && *link_uri != '/') newel->link_relative_p = 1; else if (link_has_scheme) newel->link_complete_p = 1; /* Append the new URL maintaining the order by position. 
*/ if (ctx->head == NULL) ctx->head = newel; else { struct urlpos *it, *prev = NULL; it = ctx->head; while (it && position > it->pos) { prev = it; it = it->next; } newel->next = it; if (prev) prev->next = newel; else ctx->head = newel; } return newel; }
uerr_t retrieve_url (const char *origurl, char **file, char **newloc, const char *refurl, int *dt, bool recursive) { uerr_t result; char *url; bool location_changed; int dummy; char *mynewloc, *proxy; struct url *u, *proxy_url; int up_error_code; /* url parse error code */ char *local_file; int redirection_count = 0; bool post_data_suspended = false; char *saved_post_data = NULL; char *saved_post_file_name = NULL; /* If dt is NULL, use local storage. */ if (!dt) { dt = &dummy; dummy = 0; } url = xstrdup (origurl); if (newloc) *newloc = NULL; if (file) *file = NULL; u = url_parse (url, &up_error_code); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); xfree (url); return URLERROR; } if (!refurl) refurl = opt.referer; redirected: result = NOCONERROR; mynewloc = NULL; local_file = NULL; proxy_url = NULL; proxy = getproxy (u); if (proxy) { /* Parse the proxy URL. */ proxy_url = url_parse (proxy, &up_error_code); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), proxy, url_error (up_error_code)); xfree (url); RESTORE_POST_DATA; return PROXERR; } if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme) { logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy); url_free (proxy_url); xfree (url); RESTORE_POST_DATA; return PROXERR; } } if (u->scheme == SCHEME_HTTP #ifdef HAVE_SSL || u->scheme == SCHEME_HTTPS #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); } else if (u->scheme == SCHEME_FTP) { /* If this is a redirection, temporarily turn off opt.ftp_glob and opt.recursive, both being undesirable when following redirects. */ bool oldrec = recursive, glob = opt.ftp_glob; if (redirection_count) oldrec = glob = false; result = ftp_loop (u, dt, proxy_url, recursive, glob); recursive = oldrec; /* There is a possibility of having HTTP being redirected to FTP. 
In these cases we must decide whether the text is HTML according to the suffix. The HTML suffixes are `.html', `.htm' and a few others, case-insensitive. */ if (redirection_count && local_file && u->scheme == SCHEME_FTP) { if (has_html_suffix_p (local_file)) *dt |= TEXTHTML; } } if (proxy_url) { url_free (proxy_url); proxy_url = NULL; } location_changed = (result == NEWLOCATION); if (location_changed) { char *construced_newloc; struct url *newloc_parsed; assert (mynewloc != NULL); if (local_file) xfree (local_file); /* The HTTP specs only allow absolute URLs to appear in redirects, but a ton of boneheaded webservers and CGIs out there break the rules and use relative URLs, and popular browsers are lenient about this, so wget should be too. */ construced_newloc = uri_merge (url, mynewloc); xfree (mynewloc); mynewloc = construced_newloc; /* Now, see if this new location makes sense. */ newloc_parsed = url_parse (mynewloc, &up_error_code); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), url_error (up_error_code)); url_free (u); xfree (url); xfree (mynewloc); RESTORE_POST_DATA; return result; } /* Now mynewloc will become newloc_parsed->url, because if the Location contained relative paths like .././something, we don't want that propagating as url. */ xfree (mynewloc); mynewloc = xstrdup (newloc_parsed->url); /* Check for max. number of redirections. */ if (++redirection_count > opt.max_redirect) { logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"), opt.max_redirect); url_free (newloc_parsed); url_free (u); xfree (url); xfree (mynewloc); RESTORE_POST_DATA; return WRONGCODE; } xfree (url); url = mynewloc; url_free (u); u = newloc_parsed; /* If we're being redirected from POST, we don't want to POST again. Many requests answer POST with a redirection to an index page; that redirection is clearly a GET. We "suspend" POST data for the duration of the redirections, and restore it when we're done. 
*/ if (!post_data_suspended) SUSPEND_POST_DATA; goto redirected; } if (local_file) { if (*dt & RETROKF) { register_download (u->url, local_file); if (redirection_count && 0 != strcmp (origurl, u->url)) register_redirection (origurl, u->url); if (*dt & TEXTHTML) register_html (u->url, local_file); } } if (file) *file = local_file ? local_file : NULL; else xfree_null (local_file); url_free (u); if (redirection_count) { if (newloc) *newloc = url; else xfree (url); } else { if (newloc) *newloc = NULL; xfree (url); } RESTORE_POST_DATA; return result; }