Example #1
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
               struct iri *iri)
{
  struct file_memory *fm;
  struct map_context ctx;
  int flags;

  /* Load the file. */
  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));

  ctx.text = fm->content;
  ctx.head = NULL;
  ctx.base = NULL;
  ctx.parent_base = url ? url : opt.base_href;
  ctx.document_file = file;
  ctx.nofollow = false;

  if (!interesting_tags)
    init_interesting ();

  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
     generate <a href=" foo"> instead of <a href="foo"> (browsers
     ignore spaces as well.)  If you really mean space, use &#32; or
     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
     e.g. in <img src="foo.[newline]html">.  Such newlines are also
     ignored by IE and Mozilla and are presumably introduced by
     writing HTML with editors that force word wrap.  */
  flags = MHT_TRIM_VALUES;
  if (opt.strict_comments)
    flags |= MHT_STRICT_COMMENTS;

  /* the NULL here used to be interesting_tags */
  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                 NULL, interesting_attributes);

  /* If meta charset isn't null, override content encoding */
  if (iri && meta_charset)
    set_content_encoding (iri, meta_charset);

  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = ctx.nofollow;

  xfree_null (ctx.base);
  wget_read_file_free (fm);
  return ctx.head;
}
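A minimal caller sketch, assuming the surrounding Wget internals: the file name and start URL are illustrative, iri is left NULL so no charset override happens, and free_urlpos is assumed to be the matching destructor for the returned list.

/* Hypothetical caller: parse a saved page and walk the returned list. */
bool meta_disallow_follow = false;
struct urlpos *urls = get_urls_html ("index.html", "http://example.com/",
                                     &meta_disallow_follow, NULL);

for (struct urlpos *cur = urls; cur; cur = cur->next)
  DEBUGP (("Found %s\n", cur->url->url));

if (!meta_disallow_follow)
  {
    /* ... the collected URLs could be enqueued for retrieval here ... */
  }

free_urlpos (urls);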
Example #2
struct robot_specs *
res_parse_from_file (const char *filename)
{
  struct robot_specs *specs;
  struct file_memory *fm = wget_read_file (filename);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
                 filename, strerror (errno));
      return NULL;
    }
  specs = res_parse (fm->content, fm->length);
  wget_read_file_free (fm);
  return specs;
}
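A hedged usage sketch: parse a robots.txt that was saved locally and test a path against it. res_match_path and res_register_specs are assumed to be the companion routines from Wget's res.c; host, port, and paths are illustrative.

/* Illustrative only: check whether a path may be fetched according to a
   previously saved robots.txt. */
struct robot_specs *specs = res_parse_from_file ("/tmp/robots.txt");

if (specs)
  {
    if (!res_match_path (specs, "/private/index.html"))
      logputs (LOG_VERBOSE, _("Rejected by robots.txt.\n"));

    /* Registering hands ownership of SPECS over to the robots cache. */
    res_register_specs ("example.com", 80, specs);
  }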
Example #3
/* Change the links in one file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  */
static void
convert_links (const char *file, struct urlpos *links)
{
    struct file_memory *fm;
    FILE *fp;
    const char *p;
    downloaded_file_t downloaded_file_return;

    struct urlpos *link;
    int to_url_count = 0, to_file_count = 0;

    logprintf (LOG_VERBOSE, _("Converting %s... "), file);

    {
        /* First we do a "dry run": go through the list LINKS and see whether
           any URL needs to be converted in the first place.  If not, just
           leave the file alone.  */
        int dry_count = 0;
        struct urlpos *dry;
        for (dry = links; dry; dry = dry->next)
            if (dry->convert != CO_NOCONVERT)
                ++dry_count;
        if (!dry_count)
        {
            logputs (LOG_VERBOSE, _("nothing to do.\n"));
            return;
        }
    }

    fm = wget_read_file (file);
    if (!fm)
    {
        logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                   file, strerror (errno));
        return;
    }

    downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
    if (opt.backup_converted && downloaded_file_return)
        write_backup_file (file, downloaded_file_return);

    /* Before opening the file for writing, unlink the file.  This is
       important if the data in FM is mmaped.  In such case, nulling the
       file, which is what fopen() below does, would make us read all
       zeroes from the mmaped region.  */
    if (unlink (file) < 0 && errno != ENOENT)
    {
        logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
                   quote (file), strerror (errno));
        wget_read_file_free (fm);
        return;
    }
    /* Now open the file for writing.  */
    fp = fopen (file, "wb");
    if (!fp)
    {
        logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                   file, strerror (errno));
        wget_read_file_free (fm);
        return;
    }

    /* Here we loop through all the URLs in file, replacing those of
       them that are downloaded with relative references.  */
    p = fm->content;
    for (link = links; link; link = link->next)
    {
        char *url_start = fm->content + link->pos;

        if (link->pos >= fm->length)
        {
            DEBUGP (("Something strange is going on.  Please investigate."));
            break;
        }
        /* If the URL is not to be converted, skip it.  */
        if (link->convert == CO_NOCONVERT)
        {
            DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
            continue;
        }

        /* Echo the file contents, up to the offending URL's opening
           quote, to the outfile.  */
        fwrite (p, 1, url_start - p, fp);
        p = url_start;

        switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
            /* Convert absolute URL to relative. */
        {
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname,
                                   link->link_css_p);

            if (link->link_css_p)
                p = replace_plain (p, link->size, fp, quoted_newname);
            else if (!link->link_refresh_p)
                p = replace_attr (p, link->size, fp, quoted_newname);
            else
                p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                               link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));
            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;
            break;
        }
        case CO_CONVERT_TO_COMPLETE:
            /* Convert the link to absolute URL. */
        {
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (link->link_css_p)
                p = replace_plain (p, link->size, fp, newlink);
            else if (!link->link_refresh_p)
                p = replace_attr (p, link->size, fp, quoted_newlink);
            else
                p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                               link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));
            xfree (quoted_newlink);
            ++to_url_count;
            break;
        }
        case CO_NULLIFY_BASE:
            /* Change the base href to "". */
            p = replace_attr (p, link->size, fp, "");
            break;
        case CO_NOCONVERT:
            abort ();
            break;
        }
    }

    /* Output the rest of the file. */
    if (p - fm->content < fm->length)
        fwrite (p, 1, fm->length - (p - fm->content), fp);
    fclose (fp);
    wget_read_file_free (fm);

    logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
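A hedged sketch of how convert_links might be driven. In Wget the list is prepared by convert_all_links during post-processing; here the convert decisions are made by hand purely for illustration, and free_urlpos is assumed to release the list.

/* Illustrative driver: collect the links of a page, mark each one for
   conversion, then rewrite the file in place. */
struct urlpos *links = get_urls_html ("index.html", "http://example.com/",
                                      NULL, NULL);

for (struct urlpos *cur = links; cur; cur = cur->next)
  {
    if (cur->local_name)        /* already downloaded: point at the local copy */
      cur->convert = CO_CONVERT_TO_RELATIVE;
    else                        /* not downloaded: expand to an absolute URL */
      cur->convert = CO_CONVERT_TO_COMPLETE;
  }

convert_links ("index.html", links);
free_urlpos (links);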
Example #4
struct urlpos *
get_urls_file (const char *file)
{
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;

  /* Load the file.  */
  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));

  head = tail = NULL;
  text = fm->content;
  text_end = fm->content + fm->length;
  while (text < text_end)
    {
      int up_error_code;
      char *url_text;
      struct urlpos *entry;
      struct url *url;

      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      if (!line_end)
        line_end = text_end;
      else
        ++line_end;
      text = line_end;

      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && c_isspace (*line_beg))
        ++line_beg;
      while (line_end > line_beg && c_isspace (*(line_end - 1)))
        --line_end;

      if (line_beg == line_end)
        continue;

      /* The URL is in the [line_beg, line_end) region. */

      /* We must copy the URL to a zero-terminated string, and we
         can't use alloca because we're in a loop.  *sigh*.  */
      url_text = strdupdelim (line_beg, line_end);

      if (opt.base_href)
        {
          /* Merge opt.base_href with URL. */
          char *merged = uri_merge (opt.base_href, url_text);
          xfree (url_text);
          url_text = merged;
        }

      char *new_url = rewrite_shorthand_url (url_text);
      if (new_url)
        {
          xfree (url_text);
          url_text = new_url;
        }

      url = url_parse (url_text, &up_error_code, NULL, false);
      if (!url)
        {
          char *error = url_error (url_text, up_error_code);
          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
                     file, url_text, error);
          xfree (url_text);
          xfree (error);
          inform_exit_status (URLERROR);
          continue;
        }
      xfree (url_text);

      entry = xnew0 (struct urlpos);
      entry->url = url;

      if (!head)
        head = entry;
      else
        tail->next = entry;
      tail = entry;
    }
  wget_read_file_free (fm);
  return head;
}
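A brief sketch of the typical use, as with the --input-file option: load a plain-text list of URLs (one per line) and report each parsed entry. The file name is illustrative and free_urlpos is assumed to release the list.

/* Illustrative: read URLs from a local list file and print them. */
struct urlpos *list = get_urls_file ("urls.txt");

for (struct urlpos *cur = list; cur; cur = cur->next)
  logprintf (LOG_VERBOSE, "queued: %s\n", cur->url->url);

free_urlpos (list);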
Example #5
static void html_parse_localfile(const char *fname)
{
	char *data, *raw, *utf8 = NULL;	// RAW keeps the original allocation, UTF8 a converted copy (if any)
	const char *encoding = NULL;
	size_t len;

	if ((raw = data = wget_read_file(fname, &len))) {
		if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
			// Big-endian UTF-16
			encoding = "UTF-16BE";

			// adjust behind BOM, ignore trailing single byte
			data += 2;
			len -= 2;
		} else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
			// Little-endian UTF-16
			encoding = "UTF-16LE";

			// adjust behind BOM
			data += 2;
			len -= 2;
		} else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
			// UTF-8
			encoding = "UTF-8";

			// adjust behind BOM
			data += 3;
			len -= 3;
		}

		if (encoding)
			printf("URI encoding '%s' set by BOM\n", encoding);

		if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
			size_t n;

			len -= len & 1; // ignore single trailing byte, else charset conversion fails

			if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
				printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
				data = utf8;
			} else {
				printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
				wget_xfree(raw); // avoid leaking the file buffer on the error path
				return;
			}
		}

		WGET_HTML_PARSED_RESULT *res  = wget_html_get_urls_inline(data, NULL, NULL);

		if (encoding) {
			if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
				printf("Encoding '%s' as stated in document has been ignored\n", encoding);
		}

		for (int it = 0; it < wget_vector_size(res->uris); it++) {
			WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it);
			wget_string_t *url = &html_url->url;

			printf("  %s.%s '%.*s'\n", html_url->dir, html_url->attr, (int) url->len, url->p);
		}

		wget_xfree(utf8); // converted copy, if any (wget_xfree is NULL-safe)
		wget_xfree(raw); // original buffer; DATA may point past a skipped BOM
		wget_html_free_urls_inline(&res);
	}
}
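A small driver in the style of the libwget example programs, assuming the translation unit above: it simply runs the parser over every file named on the command line.

int main(int argc, const char **argv)
{
	// Illustrative driver: parse each file given as an argument.
	for (int i = 1; i < argc; i++)
		html_parse_localfile(argv[i]);

	return 0;
}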