/* Throttle the download so that it does not exceed opt.limit_rate.

   BYTES is added to the running total of the current chunk
   (limit_data.chunk_bytes).  TIMER supplies the elapsed-time readings;
   they are treated as seconds here (values are multiplied by 1000
   before being reported as milliseconds).

   If the chunk arrived faster than the configured rate allows, the
   function sleeps for the difference.  Sleeps shorter than 0.2 s are
   deferred: the function returns without resetting the chunk, so the
   bytes keep accumulating until a worthwhile sleep is due.  Otherwise
   the chunk counters are reset before returning.  */
static void
limit_bandwidth (wgint bytes, struct ptimer *timer)
{
  double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  double expected;

  limit_data.chunk_bytes += bytes;

  /* Calculate the amount of time we expect downloading the chunk
     should take.  If in reality it took less time, sleep to
     compensate for the difference.  */
  expected = (double) limit_data.chunk_bytes / opt.limit_rate;

  if (expected > delta_t)
    {
      double slp = expected - delta_t + limit_data.sleep_adjust;
      double t0, t1;

      if (slp < 0.2)
        {
          /* Too short to be worth sleeping for; keep accumulating the
             chunk (note: chunk_bytes/chunk_start are NOT reset).  */
          DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
                   slp * 1000,
                   number_to_static_string (limit_data.chunk_bytes),
                   delta_t));
          return;
        }

      /* VisualWget: We do not expect to sleep more than 1 second.  */
      if (slp > 1)
        slp = 1;

      /* sleep_adjust is kept in the same unit as slp, so it has to be
         scaled by 1000 as well to be reported in milliseconds.  */
      DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
               slp * 1000,
               number_to_static_string (limit_data.chunk_bytes),
               limit_data.sleep_adjust * 1000));

      t0 = ptimer_read (timer);
      xsleep (slp);
      t1 = ptimer_measure (timer);

      /* Due to scheduling, we probably slept slightly longer (or
         shorter) than desired.  Calculate the difference between the
         desired and the actual sleep, and adjust the next sleep by
         that amount.  */
      limit_data.sleep_adjust = slp - (t1 - t0);

      /* If sleep_adjust is very large, it's likely due to suspension
         and not clock inaccuracy.  Don't enforce those.  */
      if (limit_data.sleep_adjust > 0.5)
        limit_data.sleep_adjust = 0.5;
      else if (limit_data.sleep_adjust < -0.5)
        limit_data.sleep_adjust = -0.5;
    }

  /* Start a fresh chunk.  */
  limit_data.chunk_bytes = 0;
  limit_data.chunk_start = ptimer_read (timer);
}
struct urlpos * get_urls_css_file (const char *file, const char *url) { struct file_memory *fm; struct map_context ctx; /* Load the file. */ fm = read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; ctx.nofollow = 0; get_urls_css (&ctx, 0, fm->length); read_file_free (fm); return ctx.head; }
struct urlpos * get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, struct iri *iri) { struct file_memory *fm; struct map_context ctx; int flags; /* Load the file. */ fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; ctx.nofollow = false; if (!interesting_tags) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that generate <a href=" foo"> instead of <a href="foo"> (browsers ignore spaces as well.) If you really mean space, use &32; or %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, e.g. in <img src="foo.[newline]html">. Such newlines are also ignored by IE and Mozilla and are presumably introduced by writing HTML with editors that force word wrap. */ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; /* the NULL here used to be interesting_tags */ map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); /* If meta charset isn't null, override content encoding */ if (iri && meta_charset) set_content_encoding (iri, meta_charset); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; xfree_null (ctx.base); wget_read_file_free (fm); return ctx.head; }
/* Convert the Un*x-ish style directory listing stored in FILE to a linked list of fileinfo (system-independent) entries. The contents of FILE are considered to be produced by the standard Unix `ls -la' output (whatever that might be). BSD (no group) and SYSV (with group) listings are handled. The time stamps are stored in a separate variable, time_t compatible (I hope). The timezones are ignored. */ static struct fileinfo * ftp_parse_unix_ls (const char *file, int ignore_perms) { FILE *fp; static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; int next, len, i, error, ignore; int year, month, day; /* for time analysis */ int hour, min, sec, ptype; struct tm timestruct, *tnow; time_t timenow; size_t bufsize = 0; char *line = NULL, *tok, *ptok; /* tokenizer */ struct fileinfo *dir, *l, cur; /* list creation */ fp = fopen (file, "rb"); if (!fp) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } dir = l = NULL; /* Line loop to end of file: */ while ((len = getline (&line, &bufsize, fp)) > 0) { len = clean_line (line, len); /* Skip if total... */ if (!strncasecmp (line, "total", 5)) continue; /* Get the first token (permissions). */ tok = strtok (line, " "); if (!tok) continue; cur.name = NULL; cur.linkto = NULL; /* Decide whether we deal with a file or a directory. */ switch (*tok) { case '-': cur.type = FT_PLAINFILE; DEBUGP (("PLAINFILE; ")); break; case 'd': cur.type = FT_DIRECTORY; DEBUGP (("DIRECTORY; ")); break; case 'l': cur.type = FT_SYMLINK; DEBUGP (("SYMLINK; ")); break; default: cur.type = FT_UNKNOWN; DEBUGP (("UNKNOWN; ")); break; } if (ignore_perms) { switch (cur.type) { case FT_PLAINFILE: cur.perms = 0644; break; case FT_DIRECTORY: cur.perms = 0755; break; default: /*cur.perms = 1023;*/ /* #### What is this? 
--hniksic */ cur.perms = 0644; } DEBUGP (("implicit perms %0o; ", cur.perms)); } else { cur.perms = symperms (tok + 1); DEBUGP (("perms %0o; ", cur.perms)); } error = ignore = 0; /* Erroneous and ignoring entries are treated equally for now. */ year = hour = min = sec = 0; /* Silence the compiler. */ month = day = 0; ptype = TT_DAY; next = -1; /* While there are tokens on the line, parse them. Next is the number of tokens left until the filename. Use the month-name token as the "anchor" (the place where the position wrt the file name is "known"). When a month name is encountered, `next' is set to 5. Also, the preceding characters are parsed to get the file size. This tactic is quite dubious when it comes to internationalization issues (non-English month names), but it works for now. */ tok = line; while (ptok = tok, (tok = strtok (NULL, " ")) != NULL) { --next; if (next < 0) /* a month name was not encountered */ { for (i = 0; i < 12; i++) if (!strcasecmp (tok, months[i])) break; /* If we got a month, it means the token before it is the size, and the filename is three tokens away. */ if (i != 12) { wgint size; /* Parse the previous token with str_to_wgint. */ if (ptok == line) { /* Something has gone wrong during parsing. */ error = 1; break; } errno = 0; size = str_to_wgint (ptok, NULL, 10); if (size == WGINT_MAX && errno == ERANGE) /* Out of range -- ignore the size. #### Should we refuse to start the download. */ cur.size = 0; else cur.size = size; DEBUGP (("size: %s; ", number_to_static_string(cur.size))); month = i; next = 5; DEBUGP (("month: %s; ", months[month])); } } else if (next == 4) /* days */ { if (tok[1]) /* two-digit... */ day = 10 * (*tok - '0') + tok[1] - '0'; else /* ...or one-digit */ day = *tok - '0'; DEBUGP (("day: %d; ", day)); } else if (next == 3) { /* This ought to be either the time, or the year. Let's be flexible! If we have a number x, it's a year. If we have x:y, it's hours and minutes. If we have x:y:z, z are seconds. 
*/ year = 0; min = hour = sec = 0; /* We must deal with digits. */ if (c_isdigit (*tok)) { /* Suppose it's year. */ for (; c_isdigit (*tok); tok++) year = (*tok - '0') + 10 * year; if (*tok == ':') { /* This means these were hours! */ hour = year; year = 0; ptype = TT_HOUR_MIN; ++tok; /* Get the minutes... */ for (; c_isdigit (*tok); tok++) min = (*tok - '0') + 10 * min; if (*tok == ':') { /* ...and the seconds. */ ++tok; for (; c_isdigit (*tok); tok++) sec = (*tok - '0') + 10 * sec; } } } if (year) DEBUGP (("year: %d (no tm); ", year)); else DEBUGP (("time: %02d:%02d:%02d (no yr); ", hour, min, sec)); } else if (next == 2) /* The file name */ { int fnlen; char *p; /* Since the file name may contain a SPC, it is possible for strtok to handle it wrong. */ fnlen = strlen (tok); if (fnlen < len - (tok - line)) { /* So we have a SPC in the file name. Restore the original. */ tok[fnlen] = ' '; /* If the file is a symbolic link, it should have a ` -> ' somewhere. */ if (cur.type == FT_SYMLINK) { p = strstr (tok, " -> "); if (!p) { error = 1; break; } cur.linkto = xstrdup (p + 4); DEBUGP (("link to: %s\n", cur.linkto)); /* And separate it from the file name. */ *p = '\0'; } } /* If we have the filename, add it to the list of files or directories. */ /* "." and ".." are an exception! */ if (!strcmp (tok, ".") || !strcmp (tok, "..")) { DEBUGP (("\nIgnoring `.' and `..'; ")); ignore = 1; break; } /* Some FTP sites choose to have ls -F as their default LIST output, which marks the symlinks with a trailing `@', directory names with a trailing `/' and executables with a trailing `*'. This is no problem unless encountering a symbolic link ending with `@', or an executable ending with `*' on a server without default -F output. I believe these cases are very rare. 
*/ fnlen = strlen (tok); /* re-calculate `fnlen' */ cur.name = xmalloc (fnlen + 1); memcpy (cur.name, tok, fnlen + 1); if (fnlen) { if (cur.type == FT_DIRECTORY && cur.name[fnlen - 1] == '/') { cur.name[fnlen - 1] = '\0'; DEBUGP (("trailing `/' on dir.\n")); } else if (cur.type == FT_SYMLINK && cur.name[fnlen - 1] == '@') { cur.name[fnlen - 1] = '\0'; DEBUGP (("trailing `@' on link.\n")); } else if (cur.type == FT_PLAINFILE && (cur.perms & 0111) && cur.name[fnlen - 1] == '*') { cur.name[fnlen - 1] = '\0'; DEBUGP (("trailing `*' on exec.\n")); } } /* if (fnlen) */ else error = 1; break; } else abort (); } /* while */ if (!cur.name || (cur.type == FT_SYMLINK && !cur.linkto)) error = 1; DEBUGP (("%s\n", cur.name ? cur.name : "")); if (error || ignore) { DEBUGP (("Skipping.\n")); xfree_null (cur.name); xfree_null (cur.linkto); continue; } if (!dir) { l = dir = xnew (struct fileinfo); memcpy (l, &cur, sizeof (cur)); l->prev = l->next = NULL; } else {
struct urlpos * get_urls_file (const char *file) { struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; /* Load the file. */ fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; text_end = fm->content + fm->length; while (text < text_end) { int up_error_code; char *url_text; struct urlpos *entry; struct url *url; const char *line_beg = text; const char *line_end = memchr (text, '\n', text_end - text); if (!line_end) line_end = text_end; else ++line_end; text = line_end; /* Strip whitespace from the beginning and end of line. */ while (line_beg < line_end && c_isspace (*line_beg)) ++line_beg; while (line_end > line_beg && c_isspace (*(line_end - 1))) --line_end; if (line_beg == line_end) continue; /* The URL is in the [line_beg, line_end) region. */ /* We must copy the URL to a zero-terminated string, and we can't use alloca because we're in a loop. *sigh*. */ url_text = strdupdelim (line_beg, line_end); if (opt.base_href) { /* Merge opt.base_href with URL. */ char *merged = uri_merge (opt.base_href, url_text); xfree (url_text); url_text = merged; } char *new_url = rewrite_shorthand_url (url_text); if (new_url) { xfree (url_text); url_text = new_url; } url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, error); xfree (url_text); xfree (error); inform_exit_status (URLERROR); continue; } xfree (url_text); entry = xnew0 (struct urlpos); entry->url = url; if (!head) head = entry; else tail->next = entry; tail = entry; } wget_read_file_free (fm); return head; }