/* Record that URL was saved locally as FILE.

   The association is stored in both directions (file -> URL in
   dl_file_url_map, URL -> file in dl_url_file_map); each table is
   created on first use and receives its own private copies of both
   strings.  Nothing is recorded unless link conversion is enabled.

   NOTE(review): both puts are unconditional.  If hash_table_put does
   not release a replaced key/value pair, registering the same URL or
   file twice leaks the earlier copies -- confirm against the hash
   table implementation.  */
void
register_download (const char *url, const char *file)
{
  if (opt.convert_links)
    {
      if (dl_file_url_map == NULL)
        dl_file_url_map = make_string_hash_table (0);
      hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));

      if (dl_url_file_map == NULL)
        dl_url_file_map = make_string_hash_table (0);
      hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
    }
}
/* Query or record how FILE was downloaded.

   With MODE == CHECK_FOR_FILE this is a pure lookup: it returns the
   mode stored for FILE, or FILE_NOT_ALREADY_DOWNLOADED when FILE (or
   the whole table) is absent.

   With any other MODE the table is created if needed.  If FILE is
   already present its stored mode is returned and nothing changes;
   otherwise FILE is entered with MODE and FILE_NOT_ALREADY_DOWNLOADED
   is returned to signal that this was the first registration.  */
downloaded_file_t
downloaded_file (downloaded_file_t mode, const char *file)
{
  downloaded_file_t *recorded;

  if (mode != CHECK_FOR_FILE)
    {
      /* Registration request.  */
      if (downloaded_files_hash == NULL)
        downloaded_files_hash = make_string_hash_table (0);

      recorded = hash_table_get (downloaded_files_hash, file);
      if (recorded != NULL)
        return *recorded;

      /* The table stores a pointer to the mode value, and owns a
         private copy of the file name.  */
      recorded = downloaded_mode_to_ptr (mode);
      hash_table_put (downloaded_files_hash, xstrdup (file), recorded);
      return FILE_NOT_ALREADY_DOWNLOADED;
    }

  /* Lookup only: never creates the table.  */
  if (downloaded_files_hash == NULL)
    return FILE_NOT_ALREADY_DOWNLOADED;

  recorded = hash_table_get (downloaded_files_hash, file);
  return recorded != NULL ? *recorded : FILE_NOT_ALREADY_DOWNLOADED;
}
/* Add FILE to the set of downloaded CSS files, creating the set the
   first time it is needed.  */
void
register_css (const char *file)
{
  if (downloaded_css_set == NULL)
    downloaded_css_set = make_string_hash_table (0);

  string_set_add (downloaded_css_set, file);
}
int main (void) { struct hash_table *ht = make_string_hash_table (0); char line[80]; while ((fgets (line, sizeof (line), stdin))) { int len = strlen (line); if (len <= 1) continue; line[--len] = '\0'; if (!hash_table_contains (ht, line)) hash_table_put (ht, strdup (line), "here I am!"); #if 1 if (len % 5 == 0) { char *line_copy; if (hash_table_get_pair (ht, line, &line_copy, NULL)) { hash_table_remove (ht, line); xfree (line_copy); } } #endif } #if 0 print_hash (ht); #endif #if 1 printf ("%d %d\n", ht->count, ht->size); #endif return 0; }
/* Add FILE to the set of downloaded HTML files, creating the set the
   first time it is needed.  URL is accepted but not used here.  */
void
register_html (const char *url, const char *file)
{
  if (downloaded_html_set == NULL)
    downloaded_html_set = make_string_hash_table (0);

  string_set_add (downloaded_html_set, file);
}
/* Remember URL as a broken link.

   URLs recognized by is_robots_txt_url are deliberately not recorded.
   The set of broken links is created on first use.  */
void
nonexisting_url (const char *url)
{
  if (!is_robots_txt_url (url))
    {
      if (nonexisting_urls_set == NULL)
        nonexisting_urls_set = make_string_hash_table (0);

      string_set_add (nonexisting_urls_set, url);
    }
}
/* Record FILE as a downloaded HTML file, both in the lookup set and
   at the head of downloaded_html_list.  A FILE that is already in the
   set is left alone, so the list never gets duplicates.  URL is
   accepted but not used here.  */
void
register_html (const char *url, const char *file)
{
  if (downloaded_html_set != NULL)
    {
      if (hash_table_contains (downloaded_html_set, file))
        return;
    }
  else
    downloaded_html_set = make_string_hash_table (0);

  /* Ideally the set and the list would share one copy of FILE, but
     the slist interface insists on duplicating the string it is
     given.  */
  string_set_add (downloaded_html_set, file);
  downloaded_html_list = slist_prepend (downloaded_html_list, file);
}
/* Move FILE out of the way before its converted version is written.

   Instead of overwriting the original document with the converted
   one, the original is renamed to a backup name.  This is only done
   for files we have successfully downloaded ourselves, so backups
   left behind by earlier invocations are not clobbered.  The suffix
   comes from ORIG_SFX (".orig"; "_orig" on VMS, see "wget.h").

   DOWNLOADED_FILE_RETURN says how FILE was saved and therefore how the
   backup name is built.  A failed rename is logged and is not fatal.  */
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  size_t filename_len = strlen (file);
  char *backup_name;

  /* TODO: hack this to work with css files */
  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
      /* Overwrite the trailing "html" with "orig".  When we later
         check whether a file was downloaded before (to decide whether
         the download can be skipped) we do not yet know that it is
         text/html, hence not that -E will tack on ".html"; the
         comparison must therefore be against the original URL plus
         ".orig", not the original URL plus ".html.orig".  */
      backup_name = alloca (filename_len + 1);
      memcpy (backup_name, file, filename_len + 1);
      memcpy (backup_name + filename_len - 4, "orig", sizeof ("orig"));
    }
  else
    {
      /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY: simply
         append the suffix to the name.  */
      backup_name = alloca (filename_len + sizeof (ORIG_SFX));
      memcpy (backup_name, file, filename_len + 1);
      memcpy (backup_name + filename_len, ORIG_SFX, sizeof (ORIG_SFX));
    }

  if (converted_files == NULL)
    converted_files = make_string_hash_table (0);

  /* We can be called twice for the same URL because of the
     convert_all_links() call in main.  Writing the backup each time
     would leave the first-pass conversion in it rather than the
     original document, so do nothing if FILE has been seen before.  */
  if (string_set_contains (converted_files, file))
    return;

  /* Rename <file> to its backup name before the former is written
     over.  */
  if (rename (file, backup_name) != 0)
    logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
               file, backup_name, strerror (errno));

  /* Remember that a backup has been written for this file.  The entry
     is never freed because it is needed until the final
     convert_all_links() call, one of the last things the program does.

     Recording this in a field of the urlpos structure does not work:
     convert_file() is called from convert_all_links at the end of the
     retrieval with a freshly built urlpos list, so a separate set is
     required.  */
  string_set_add (converted_files, file);
}
uerr_t retrieve_tree (const char *start_url) { uerr_t status = RETROK; /* The queue of URLs we need to load. */ struct url_queue *queue; /* The URLs we do not wish to enqueue, because they are already in the queue, but haven't been downloaded yet. */ struct hash_table *blacklist; int up_error_code; struct url *start_url_parsed = url_parse (start_url, &up_error_code); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, url_error (up_error_code)); return URLERROR; } queue = url_queue_new (); blacklist = make_string_hash_table (0); /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1); string_set_add (blacklist, start_url_parsed->url); while (1) { int descend = 0; char *url, *referer, *file = NULL; int depth, html_allowed; boolean dash_p_leaf_HTML = FALSE; if (opt.quota && total_downloaded_bytes > opt.quota) break; if (status == FWRITEERR) break; /* Get the next URL from the queue... */ if (!url_dequeue (queue, (const char **)&url, (const char **)&referer, &depth, &html_allowed)) break; /* ...and download it. Note that this download is in most cases unconditional, as download_child_p already makes sure a file doesn't get enqueued twice -- and yet this check is here, and not in download_child_p. This is so that if you run `wget -r URL1 URL2', and a random URL is encountered once under URL1 and again under URL2, but at a different (possibly smaller) depth, we want the URL's children to be taken into account the second time. 
*/ if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) { file = xstrdup (hash_table_get (dl_url_file_map, url)); DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", url, file)); if (html_allowed && downloaded_html_set && string_set_contains (downloaded_html_set, file)) descend = 1; } else { int dt = 0; char *redirected = NULL; int oldrec = opt.recursive; opt.recursive = 0; status = retrieve_url (url, &file, &redirected, referer, &dt); opt.recursive = oldrec; if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) descend = 1; if (redirected) { /* We have been redirected, possibly to another host, or different path, or wherever. Check whether we really want to follow it. */ if (descend) { if (!descend_redirect_p (redirected, url, depth, start_url_parsed, blacklist)) descend = 0; else /* Make sure that the old pre-redirect form gets blacklisted. */ string_set_add (blacklist, url); } xfree (url); url = redirected; } } if (descend && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) { if (opt.page_requisites && (depth == opt.reclevel || depth == opt.reclevel + 1)) { /* When -p is specified, we are allowed to exceed the maximum depth, but only for the "inline" links, i.e. those that are needed to display the page. Originally this could exceed the depth at most by one, but we allow one more level so that the leaf pages that contain frames can be loaded correctly. */ dash_p_leaf_HTML = TRUE; } else { /* Either -p wasn't specified or it was and we've already spent the two extra (pseudo-)levels that it affords us, so we need to bail out. */ DEBUGP (("Not descending further; at depth %d, max. %d.\n", depth, opt.reclevel)); descend = 0; } } /* If the downloaded document was HTML, parse it and enqueue the links it contains. 
*/ if (descend) { int meta_disallow_follow = 0; struct urlpos *children = get_urls_html (file, url, &meta_disallow_follow); if (opt.use_robots && meta_disallow_follow) { free_urlpos (children); children = NULL; } if (children) { struct urlpos *child = children; struct url *url_parsed = url_parsed = url_parse (url, NULL); assert (url_parsed != NULL); for (; child; child = child->next) { if (child->ignore_when_downloading) continue; if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, blacklist)) { url_enqueue (queue, xstrdup (child->url->url), xstrdup (url), depth + 1, child->link_expect_html); /* We blacklist the URL we have enqueued, because we don't want to enqueue (and hence download) the same URL twice. */ string_set_add (blacklist, child->url->url); } } url_free (url_parsed); free_urlpos (children); } } if (opt.delete_after || (file && !acceptable (file))) { /* Either --delete-after was specified, or we loaded this otherwise rejected (e.g. by -R) HTML file just so we could harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : "recursive rejection criteria")); logprintf (LOG_VERBOSE, (opt.delete_after ? _("Removing %s.\n") : _("Removing %s since it should be rejected.\n")), file); if (unlink (file)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); register_delete_file (file); } xfree (url); FREE_MAYBE (referer); FREE_MAYBE (file); } /* If anything is left of the queue due to a premature exit, free it now. 
*/ { char *d1, *d2; int d3, d4; while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3, &d4)) { xfree (d1); FREE_MAYBE (d2); } } url_queue_delete (queue); if (start_url_parsed) url_free (start_url_parsed); string_set_free (blacklist); if (opt.quota && total_downloaded_bytes > opt.quota) return QUOTEXC; else if (status == FWRITEERR) return FWRITEERR; else return RETROK; }
/* The core of recursive retrieving. Endless recursion is avoided by having all URLs stored to a linked list of URLs, which is checked before loading any URL. That way no URL can get loaded twice. The function also supports specification of maximum recursion depth and a number of other goodies. */ uerr_t recursive_retrieve (const char *file, const char *this_url) { char *constr, *filename, *newloc; char *canon_this_url = NULL; int dt, inl, dash_p_leaf_HTML = FALSE; int meta_disallow_follow; int this_url_ftp; /* See below the explanation */ uerr_t err; struct urlinfo *rurl; urlpos *url_list, *cur_url; char *rfile; /* For robots */ struct urlinfo *u; assert (this_url != NULL); assert (file != NULL); /* If quota was exceeded earlier, bail out. */ if (downloaded_exceeds_quota ()) return QUOTEXC; /* Cache the current URL in the list. */ if (first_time) { /* These three operations need to be done only once per Wget run. They should probably be at a different location. */ if (!undesirable_urls) undesirable_urls = make_string_hash_table (0); hash_table_clear (undesirable_urls); string_set_add (undesirable_urls, this_url); /* Enter this_url to the hash table, in original and "enhanced" form. */ u = newurl (); err = parseurl (this_url, u, 0); if (err == URLOK) { string_set_add (undesirable_urls, u->url); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This problem exists only when running the first time. */ canon_this_url = xstrdup (u->url); } else { DEBUGP (("Double yuck! The *base* URL is broken.\n")); base_dir = NULL; } freeurl (u, 1); depth = 1; robots_host = NULL; forbidden = NULL; first_time = 0; } else ++depth; if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel) /* We've exceeded the maximum recursion depth specified by the user. 
*/ { if (opt.page_requisites && depth <= opt.reclevel + 1) /* When -p is specified, we can do one more partial recursion from the "leaf nodes" on the HTML document tree. The recursion is partial in that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags except for <LINK REL="stylesheet">. */ dash_p_leaf_HTML = TRUE; else /* Either -p wasn't specified or it was and we've already gone the one extra (pseudo-)level that it affords us, so we need to bail out. */ { DEBUGP (("Recursion depth %d exceeded max. depth %d.\n", depth, opt.reclevel)); --depth; return RECLEVELEXC; } } /* Determine whether this_url is an FTP URL. If it is, it means that the retrieval is done through proxy. In that case, FTP links will be followed by default and recursion will not be turned off when following them. */ this_url_ftp = (urlproto (this_url) == URLFTP); /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, dash_p_leaf_HTML, &meta_disallow_follow); if (opt.use_robots && meta_disallow_follow) { /* The META tag says we are not to follow this file. Respect that. */ free_urlpos (url_list); url_list = NULL; } /* Decide what to do with each of the URLs. A URL will be loaded if it meets several requirements, discussed later. */ for (cur_url = url_list; cur_url; cur_url = cur_url->next) { /* If quota was exceeded earlier, bail out. */ if (downloaded_exceeds_quota ()) break; /* Parse the URL for convenient use in other functions, as well as to get the optimized form. It also checks URL integrity. */ u = newurl (); if (parseurl (cur_url->url, u, 0) != URLOK) { DEBUGP (("Yuck! A bad URL.\n")); freeurl (u, 1); continue; } if (u->proto == URLFILE) { DEBUGP (("Nothing to do with file:// around here.\n")); freeurl (u, 1); continue; } assert (u->url != NULL); constr = xstrdup (u->url); /* Several checkings whether a file is acceptable to load: 1. check if URL is ftp, and we don't load it 2. 
check for relative links (if relative_only is set) 3. check for domain 4. check for no-parent 5. check for excludes && includes 6. check for suffix 7. check for same host (if spanhost is unset), with possible gethostbyname baggage 8. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". Note that .html and (yuck) .htm will get loaded regardless of suffix rules (but that is remedied later with unlink) unless the depth equals the maximum depth. More time- and memory- consuming tests should be put later on the list. */ /* inl is set if the URL we are working on (constr) is stored in undesirable_urls. Using it is crucial to avoid unnecessary repeated continuous hits to the hash table. */ inl = string_set_contains (undesirable_urls, constr); /* If it is FTP, and FTP is not followed, chuck it out. */ if (!inl) if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) { DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* If it is absolute link and they are not followed, chuck it out. */ if (!inl && u->proto != URLFTP) if (opt.relative_only && !cur_url->link_relative_p) { DEBUGP (("It doesn't really look like a relative link.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* If its domain is not to be accepted/looked-up, chuck it out. */ if (!inl) if (!accept_domain (u)) { DEBUGP (("I don't like the smell of that domain.\n")); string_set_add (undesirable_urls, constr); inl = 1; } /* Check for parent directory. */ if (!inl && opt.no_parent /* If the new URL is FTP and the old was not, ignore opt.no_parent. */ && !(!this_url_ftp && u->proto == URLFTP)) { /* Check for base_dir first. */ if (!(base_dir && frontcmp (base_dir, u->dir))) { /* Failing that, check for parent dir. */ struct urlinfo *ut = newurl (); if (parseurl (this_url, ut, 0) != URLOK) DEBUGP (("Double yuck! 
The *base* URL is broken.\n")); else if (!frontcmp (ut->dir, u->dir)) { /* Failing that too, kill the URL. */ DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); string_set_add (undesirable_urls, constr); inl = 1; } freeurl (ut, 1); } } /* If the file does not match the acceptance list, or is on the rejection list, chuck it out. The same goes for the directory exclude- and include- lists. */ if (!inl && (opt.includes || opt.excludes)) { if (!accdir (u->dir, ALLABS)) { DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); string_set_add (undesirable_urls, constr); inl = 1; } } if (!inl) { char *suf = NULL; /* We check for acceptance/rejection rules only for non-HTML documents. Since we don't know whether they really are HTML, it will be deduced from (an OR-ed list): 1) u->file is "" (meaning it is a directory) 2) suffix exists, AND: a) it is "html", OR b) it is "htm" If the file *is* supposed to be HTML, it will *not* be subject to acc/rej rules, unless a finite maximum depth has been specified and the current depth is the maximum depth. */ if (! (!*u->file || (((suf = suffix (constr)) != NULL) && ((!strcmp (suf, "html") || !strcmp (suf, "htm")) && ((opt.reclevel != INFINITE_RECURSION) && (depth != opt.reclevel)))))) { if (!acceptable (u->file)) { DEBUGP (("%s (%s) does not match acc/rej rules.\n", constr, u->file)); string_set_add (undesirable_urls, constr); inl = 1; } } FREE_MAYBE (suf); } /* Optimize the URL (which includes possible DNS lookup) only after all other possibilities have been exhausted. */ if (!inl) { if (!opt.simple_check) opt_url (u); else { char *p; /* Just lowercase the hostname. 
*/ for (p = u->host; *p; p++) *p = TOLOWER (*p); xfree (u->url); u->url = str_url (u, 0); } xfree (constr); constr = xstrdup (u->url); string_set_add (undesirable_urls, constr); if (!inl && !((u->proto == URLFTP) && !this_url_ftp)) if (!opt.spanhost && this_url && !same_host (this_url, constr)) { DEBUGP (("This is not the same hostname as the parent's.\n")); string_set_add (undesirable_urls, constr); inl = 1; } } /* What about robots.txt? */ if (!inl && opt.use_robots && u->proto == URLHTTP) { /* Since Wget knows about only one set of robot rules at a time, /robots.txt must be reloaded whenever a new host is accessed. robots_host holds the host the current `forbid' variable is assigned to. */ if (!robots_host || !same_host (robots_host, u->host)) { FREE_MAYBE (robots_host); /* Now make robots_host the new host, no matter what the result will be. So if there is no /robots.txt on the site, Wget will not retry getting robots all the time. */ robots_host = xstrdup (u->host); free_vec (forbidden); forbidden = NULL; err = retrieve_robots (constr, ROBOTS_FILENAME); if (err == ROBOTSOK) { rurl = robots_url (constr, ROBOTS_FILENAME); rfile = url_filename (rurl); forbidden = parse_robots (rfile); freeurl (rurl, 1); xfree (rfile); } } /* Now that we have (or don't have) robots, we can check for them. */ if (!robots_match (u, forbidden)) { DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, ROBOTS_FILENAME)); string_set_add (undesirable_urls, constr); inl = 1; } } filename = NULL; /* If it wasn't chucked out, do something with it. */ if (!inl) { DEBUGP (("I've decided to load it -> ")); /* Add it to the list of already-loaded URL-s. */ string_set_add (undesirable_urls, constr); /* Automatically followed FTPs will *not* be downloaded recursively. */ if (u->proto == URLFTP) { /* Don't you adore side-effects? */ opt.recursive = 0; } /* Reset its type. */ dt = 0; /* Retrieve it. */ retrieve_url (constr, &filename, &newloc, canon_this_url ? 
canon_this_url : this_url, &dt); if (u->proto == URLFTP) { /* Restore... */ opt.recursive = 1; } if (newloc) { xfree (constr); constr = newloc; } /* If there was no error, and the type is text/html, parse it recursively. */ if (dt & TEXTHTML) { if (dt & RETROKF) recursive_retrieve (filename, constr); } else DEBUGP (("%s is not text/html so we don't chase.\n", filename ? filename: "(null)")); if (opt.delete_after || (filename && !acceptable (filename))) /* Either --delete-after was specified, or we loaded this otherwise rejected (e.g. by -R) HTML file just so we could harvest its hyperlinks -- in either case, delete the local file. */ { DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : "recursive rejection criteria")); logprintf (LOG_VERBOSE, (opt.delete_after ? _("Removing %s.\n") : _("Removing %s since it should be rejected.\n")), filename); if (unlink (filename)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); dt &= ~RETROKF; } /* If everything was OK, and links are to be converted, let's store the local filename. */ if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) { cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->local_name = xstrdup (filename); } } else DEBUGP (("%s already in list, so we don't load.\n", constr)); /* Free filename and constr. */ FREE_MAYBE (filename); FREE_MAYBE (constr); freeurl (u, 1); /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links && !opt.delete_after) /* This is merely the first pass: the links that have been successfully downloaded are converted. In the second pass, convert_all_links() will also convert those links that have NOT been downloaded to their canonical form. */ convert_links (file, url_list); /* Free the linked list of URL-s. */ free_urlpos (url_list); /* Free the canonical this_url. */ FREE_MAYBE (canon_this_url); /* Decrement the recursion depth. 
*/ --depth; if (downloaded_exceeds_quota ()) return QUOTEXC; else return RETROK; }