Example 1
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  if (undesirable_urls)
    {
      string_set_free (undesirable_urls);
      undesirable_urls = NULL;
    }
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
  first_time = 1;
}
Example 2
static void
delete_cookie (struct cookie *cookie)
{
  FREE_MAYBE (cookie->domain);
  FREE_MAYBE (cookie->path);
  FREE_MAYBE (cookie->attr);
  FREE_MAYBE (cookie->value);
  xfree (cookie);
}
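
All of these examples lean on Wget's FREE_MAYBE helper, which frees a pointer only when it is non-NULL, so that fields which may never have been allocated can be passed safely. Its exact definition lives in Wget's headers and has varied between releases; a minimal sketch of the idiom (assuming the usual xfree wrapper seen above) is:

/* Sketch of the FREE_MAYBE idiom -- not the verbatim wget.h definition.
   Frees PTR only if it is non-NULL.  */
#define FREE_MAYBE(ptr) do { if (ptr) xfree (ptr); } while (0)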
Example 3
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  free_slist (ulist);
  ulist = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  free_slist (urls_html);
  urls_html = NULL;
  free_urlpos (urls_downloaded);
  urls_downloaded = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
  first_time = 1;
}
Example 4
File: retr.c Project: aosm/wget
/* Find the URL-s in the file and call retrieve_url() for each of
   them.  If HTML is non-zero, treat the file as HTML, and construct
   the URL-s accordingly.

   If opt.recursive is set, call recursive_retrieve() for each file.  */
uerr_t
retrieve_from_file (const char *file, int html, int *count)
{
  uerr_t status;
  urlpos *url_list, *cur_url;

  /* If spider-mode is on, we do not want get_urls_html barfing
     errors on baseless links.  */
  url_list = (html ? get_urls_html (file, NULL, opt.spider)
	      : get_urls_file (file));
  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */
  recursive_reset ();
  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
    {
      char *filename, *new_file;
      int dt;

      if (opt.quota && opt.downloaded > opt.quota)
	{
	  status = QUOTEXC;
	  break;
	}
      status = retrieve_url (cur_url->url, &filename, &new_file, NULL, &dt);
      if (opt.recursive && status == RETROK && (dt & TEXTHTML))
	status = recursive_retrieve (filename, new_file ? new_file : cur_url->url);

      if (filename && opt.delete_after && file_exists_p (filename))
	{
	  logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
	  if (unlink (filename))
	    logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	  dt &= ~RETROKF;
	}

      FREE_MAYBE (new_file);
      FREE_MAYBE (filename);
    }

  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);

  return status;
}
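
For reference, a hypothetical caller (the file name and error handling here are invented for illustration; Wget's own main() does something similar when -i is used) would drive retrieve_from_file roughly like this:

/* Hypothetical driver, not verbatim from main.c: download every URL listed
   in "urls.txt", treating the file as a plain URL list (html == 0).  */
int count = 0;
uerr_t res = retrieve_from_file ("urls.txt", 0, &count);
if (res != RETROK)
  logprintf (LOG_NOTQUIET, _("Stopped after %d URLs (status %d).\n"),
	     count, (int) res);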
Example 5
/* History:
   2015-01-29 17:28  Took a first look.  There are no examples, so it is hard
   to tell what actually happens during parsing -- still rather confusing.  It
   would be best to write a small example and test it.  It also looks like
   some knowledge of HTML syntax is required.
*/
const char *
htmlfindurl(const char *buf, int bufsize, int *size, int init)
{
    const char *p, *ph;
    state_t *s;

    /* NULL-terminated list of tags and modifiers someone would want to
       follow -- feel free to edit to suit your needs: */
    //z Allowed HTML tags; declared static so there is only one copy.
    static struct tag_attr html_allow[] =
    {
        //z tag : attr value pairs
        { "a", "href" },
        { "img", "src" },
        { "img", "href" },
        { "body", "background" },
        { "frame", "src" },
        { "iframe", "src" },
        { "fig", "src" },
        { "overlay", "src" },
        { "applet", "code" },
        { "script", "src" },
        { "embed", "src" },
        { "bgsound", "src" },
        { "area", "href" },
        { "img", "lowsrc" },
        { "input", "src" },
        { "layer", "src" },
        { "table", "background"},
        { "th", "background"},
        { "td", "background"},
        /* Tags below this line are treated specially.  */
        { "base", "href" },
        { "meta", "content" },
        { NULL, NULL }            //z the list ends with a NULL entry
    };

    s = &global_state;

    if (init)
    {
        DEBUGP (("Resetting a parser state.\n"));
        memset (s, 0, sizeof (*s));
    }

    while (1)
    {
        //z if bufsize is 0, break out of the loop
        if (!bufsize)
            break;

        /* Let's look for a tag, if we are not already in one.  */
        //z first, look for a tag
        if (!s->at_value)
        {
            /* Find '<'.  */
            //z find '<'
            if (*buf != '<')
                for (; bufsize && *buf != '<'; ++buf, --bufsize);

            //z if bufsize is 0, we have reached the end
            if (!bufsize)
                break;

            /* Skip spaces.  */
            //z skip spaces while processing
            for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                    ++buf, --bufsize);

            if (!bufsize)
                break;

            p = buf;

            /* Find the tag end.  */
            //z advance until a space, '>', '=', or the end of the buffer is found
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);

            if (!bufsize)
                break;

            //z if '=' was found
            if (*buf == '=')
            {
                /* <tag=something> is illegal.  Just skip it.  */
                ++buf, --bufsize;

                continue;
            }

            if (p == buf)
            {
                /* *buf == '>'.  */
                ++buf, --bufsize;

                continue;
            }

            s->tag = strdupdelim (p, buf);

            if (*buf == '>')
            {
                free (s->tag);
                s->tag = NULL;
                ++buf, --bufsize;
                continue;
            }
        }
        else                      /* s->at_value */
        {
            //z this means we are now looking for the value
            /* Reset AT_VALUE.  */
            s->at_value = 0;
            /* If in quotes, just skip out of them and continue living.  */
            if (s->in_quote)
            {
                s->in_quote = 0;
                for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);

                if (!bufsize)
                    break;
                ++buf, --bufsize;
            }

            if (!bufsize)
                break;

            if (*buf == '>')
            {
                FREE_MAYBE (s->tag);
                FREE_MAYBE (s->attr);
                s->tag = s->attr = NULL;
                continue;
            }
        }

        /* Find the attributes.  */
        do
        {
            FREE_MAYBE (s->attr);
            s->attr = NULL;

            if (!bufsize)
                break;
            /* Skip the spaces if we have them.  We don't have them at
            places like <img alt="something"src="something-else">.
            ^ no spaces here */

            if (ISSPACE (*buf))
                for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);

            if (!bufsize || *buf == '>')
                break;

            if (*buf == '=')
            {
                /* This is the case of <tag = something>, which is
                  illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }

            p = buf;
            /* Find the attribute end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);

            if (!bufsize || *buf == '>')
                break;

            //z the text in between is the attribute name
            /* Construct the attribute.  */
            s->attr = strdupdelim (p, buf);
            /* Now we must skip the spaces to find '='.  */
            if (*buf != '=')
            {
                for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
                if (!bufsize || *buf == '>')
                    break;
            }

            /* If we still don't have '=', something is amiss.  */
            //z check whether '=' was found; if not, something has probably gone wrong
            if (*buf != '=')
                continue;

            /* Find the beginning of attribute value by skipping the
            spaces.  */
            ++buf, --bufsize;
            //z skip over any whitespace characters
            for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
            //z check whether we hit the end or found '>'
            if (!bufsize || *buf == '>')
                break;
            ph = NULL;
            /* The value of an attribute can, but does not have to be
            quoted.  */
            //z if the current character is ' or ", enter the quoted state
            if (*buf == '\"' || *buf == '\'')
            {
                //z enter the quote state
                s->in_quote = 1;
                //z remember the quote character so the closing quote can be matched
                s->quote_char = *buf;
                //z p points at the first character inside the quotes
                p = buf + 1;
                //z step forward until the matching quote or a newline is found
                for (++buf, --bufsize;
                        bufsize && *buf != s->quote_char && *buf != '\n';
                        ++buf, --bufsize)

                    //z if the current character is '#', record its position
                    if (*buf == '#')
                        ph = buf;
                if (!bufsize)
                {
                    //z reached the end of the buffer, so leave the in_quote state
                    s->in_quote = 0;
                    break;
                }
                //z if a '\n' was encountered, continue with the next round
                if (*buf == '\n')
                {
                    /* #### Is the following logic good?

                     Obviously no longer in quote.  It might be well
                     to check whether '>' was encountered, but that
                     would be encouraging writers of invalid HTMLs,
                      and we don't want that, now do we?  */
                    s->in_quote = 0;
                    continue;
                }
            }
            else
            {
                p = buf;

                for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
                    if (*buf == '#')
                        ph = buf;
                if (!bufsize)
                    break;
            }

            //z a '#' in the URI ("found unprotected" -- meaning what, exactly?); it probably marks an HTML fragment or a color spec
            /* If '#' was found unprotected in a URI, it is probably an
            HTML marker, or color spec.  */
            //z if there was a '#', treat ph as the end of the value?
            *size = (ph ? ph : buf) - p;
            /* The URI is liable to be returned if:
            1) *size != 0;
            2) its tag and attribute are found in html_allow.  */
            //z a typical case would be something like: <a href="http://www.w3school.com.cn/">Visit W3School</a>
            if (*size && idmatch (html_allow, s->tag, s->attr))
            {
                if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
                {
                    FREE_MAYBE (s->base);
                    s->base = strdupdelim (p, buf);
                }
                //z match against "meta" and "content"
                else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
                {
                    /* Some pages use a META tag to specify that the page
                    be refreshed by a new page after a given number of
                    seconds.  We need to attempt to extract an URL for
                    the new page from the other garbage present.  The
                    general format for this is:
                    <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

                     So we just need to skip past the "0; URL="
                     garbage to get to the URL.  META tags are also
                     used for specifying random things like the page
                     author's name and what editor was used to create
                     it.  So we need to be careful to ignore them and
                      not assume that an URL will be present at all.  */
                    //z keep advancing as long as the character is a digit
                    for (; *size && ISDIGIT (*p); p++, *size -= 1);

                    //z check whether we hit ';'
                    if (*p == ';')
                    {
                        //z skip spaces
                        for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
                        //z check whether "URL=" follows
                        if (!strncasecmp (p, "URL=", 4))
                        {
                            //z a URL was found in the meta content
                            p += 4, *size -= 4;
                            s->at_value = 1;
                            //z so this just returns p directly?
                            return p;
                        }
                    }
                }
                else
                {
                    s->at_value = 1;
                    return p;
                }
            }

            /* Exit from quote.  */
            if (*buf == s->quote_char)
            {
                s->in_quote = 0;
                ++buf, --bufsize;
            }
        }
        while (*buf != '>');

        FREE_MAYBE (s->tag);
        FREE_MAYBE (s->attr);
        s->tag = s->attr = NULL;

        if (!bufsize)
            break;
    }

    FREE_MAYBE (s->tag);
    FREE_MAYBE (s->attr);
    FREE_MAYBE (s->base);

    memset (s, 0, sizeof (*s));	/* just to be sure */
    DEBUGP (("HTML parser ends here (state destroyed).\n"));

    return NULL;
}
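
The history note above asks how this parser is actually driven. In Wget the caller is get_urls_html: it calls htmlfindurl repeatedly on the same buffer, passing init = 1 only on the first call (to reset the static state); each successful call returns a pointer to the start of a URL and stores its length in *size, and the next call resumes right after that value thanks to the saved at_value state. A hedged sketch of such a driver loop (handle_url is a placeholder, not a real Wget function):

/* Sketch of a driver loop, assumed rather than copied from html.c.
   BUF holds the whole HTML document, BUFSIZE is its length.  */
static void
scan_for_urls (const char *buf, int bufsize)
{
    const char *p = buf;
    int urlsize;
    int first = 1;

    while ((p = htmlfindurl (p, bufsize - (int)(p - buf), &urlsize, first)) != NULL)
    {
        first = 0;
        handle_url (p, urlsize);   /* placeholder: P points at URLSIZE bytes of URL text */
        p += urlsize;              /* resume scanning right after the returned value */
    }
}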
Example 6
/* The function creates an HTML index containing references to given
   directories and files on the appropriate host.  The references are
   FTP.  */
uerr_t
ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
{
    FILE *fp;
    char *upwd;
    char *htclfile;		/* HTML-clean file name */

    if (!opt.dfp)
    {
        fp = fopen (file, "wb");

        if (!fp)
        {
            logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
            return FOPENERR;
        }
    }
    else
        fp = opt.dfp;

    if (u->user)
    {
        char *tmpu, *tmpp;        /* temporary, clean user and passwd */

        tmpu = CLEANDUP (u->user);
        tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
        upwd = (char *)xmalloc (strlen (tmpu)
                                + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);

        sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");

        free (tmpu);
        FREE_MAYBE (tmpp);
    }
    else
        upwd = xstrdup ("");

    fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
    fprintf (fp, "<html>\n<head>\n<title>");
    fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
    fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
    fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
    fprintf (fp, "</h1>\n<hr>\n<pre>\n");

    while (f)
    {
        fprintf (fp, "  ");
        if (f->tstamp != -1)
        {
            /* #### Should we translate the months? */
            static char *months[] =
            {
                "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
            };
            struct tm *ptm = localtime ((time_t *)&f->tstamp);

            fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
                     ptm->tm_mday);
            if (ptm->tm_hour)
                fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
            else
                fprintf (fp, "       ");
        }
        else
            fprintf (fp, _("time unknown       "));

        switch (f->type)
        {
        case FT_PLAINFILE:
            fprintf (fp, _("File        "));
            break;
        case FT_DIRECTORY:
            fprintf (fp, _("Directory   "));
            break;
        case FT_SYMLINK:
            fprintf (fp, _("Link        "));
            break;
        default:
            fprintf (fp, _("Not sure    "));
            break;
        }

        htclfile = html_quote_string (f->name);
        fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
        if (*u->dir != '/')
            putc ('/', fp);
        fprintf (fp, "%s", u->dir);
        if (*u->dir)
            putc ('/', fp);
        fprintf (fp, "%s", htclfile);
        if (f->type == FT_DIRECTORY)
            putc ('/', fp);
        fprintf (fp, "\">%s", htclfile);
        if (f->type == FT_DIRECTORY)
            putc ('/', fp);
        fprintf (fp, "</a> ");
        if (f->type == FT_PLAINFILE)
            fprintf (fp, _(" (%s bytes)"), legible (f->size));
        else if (f->type == FT_SYMLINK)
            fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");

        putc ('\n', fp);
        free (htclfile);
        f = f->next;
    }

    fprintf (fp, "</pre>\n</body>\n</html>\n");
    free (upwd);

    if (!opt.dfp)
        fclose (fp);
    else
        fflush (fp);

    return FTPOK;
}
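
To make the output format concrete: for a plain file foo.txt of 1024 bytes in directory pub on host example.com (port 21, no user, timestamp known), the loop above would emit approximately this line inside the <pre> block (host, file name and size are hypothetical):

  1997 Dec 15 14:23  File        <a href="ftp://example.com:21/pub/foo.txt">foo.txt</a>  (1,024 bytes)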
Example 7
static int
update_cookie_field (struct cookie *cookie,
		     const char *name_b, const char *name_e,
		     const char *value_b, const char *value_e)
{
  assert (name_b != NULL && name_e != NULL);

  if (!cookie->attr)
    {
      if (!VALUE_EXISTS)
	return 0;
      cookie->attr = strdupdelim (name_b, name_e);
      cookie->value = strdupdelim (value_b, value_e);
      return 1;
    }

  if (NAME_IS ("domain"))
    {
      if (!VALUE_NON_EMPTY)
	return 0;
      FREE_MAYBE (cookie->domain);
      cookie->domain = strdupdelim (value_b, value_e);
      return 1;
    }
  else if (NAME_IS ("path"))
    {
      if (!VALUE_NON_EMPTY)
	return 0;
      FREE_MAYBE (cookie->path);
      cookie->path = strdupdelim (value_b, value_e);
      return 1;
    }
  else if (NAME_IS ("expires"))
    {
      char *value_copy;
      time_t expires;

      if (!VALUE_NON_EMPTY)
	return 0;
      BOUNDED_TO_ALLOCA (value_b, value_e, value_copy);

      expires = http_atotm (value_copy);
      if (expires != -1)
	{
	  cookie->permanent = 1;
	  cookie->expiry_time = (unsigned long)expires;
	}
      else
	/* Error in expiration spec.  Assume default (cookie valid for
	   this session.)  #### Should we return 0 and invalidate the
	   cookie?  */
	;

      /* According to netscape's specification, expiry time in the
	 past means that discarding of a matching cookie is
	 requested.  */
      if (cookie->expiry_time < cookies_now)
	cookie->discard_requested = 1;

      return 1;
    }
  else if (NAME_IS ("max-age"))
    {
      double maxage = -1;
      char *value_copy;

      if (!VALUE_NON_EMPTY)
	return 0;
      BOUNDED_TO_ALLOCA (value_b, value_e, value_copy);

      sscanf (value_copy, "%lf", &maxage);
      if (maxage == -1)
	/* something is wrong. */
	return 0;
      cookie->permanent = 1;
      cookie->expiry_time = (unsigned long)cookies_now + (unsigned long)maxage;

      /* According to rfc2109, a cookie with max-age of 0 means that
	 discarding of a matching cookie is requested.  */
      if (maxage == 0)
	cookie->discard_requested = 1;

      return 1;
    }
  else if (NAME_IS ("secure"))
    {
      /* ignore value completely */
      cookie->secure = 1;
      return 1;
    }
  else
    /* Unrecognized attribute; ignore it. */
    return 1;
}
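
update_cookie_field relies on a handful of helper macros defined alongside it in cookies.c: VALUE_EXISTS and VALUE_NON_EMPTY test the parsed value range, NAME_IS compares the attribute name case-insensitively against a literal, and BOUNDED_TO_ALLOCA copies the range [value_b, value_e) into a NUL-terminated stack buffer. Their exact definitions are not reproduced here; a rough sketch of the intent, written with plain strncasecmp, would be:

/* Rough sketch of the helpers used above -- intent only, not the verbatim
   cookies.c definitions.  NAME_B/NAME_E and VALUE_B/VALUE_E delimit the
   attribute name and value within the Set-Cookie header.  */
#define VALUE_EXISTS    (value_b && value_e)
#define VALUE_NON_EMPTY (VALUE_EXISTS && (value_b != value_e))
#define NAME_IS(literal)                                        \
  (strncasecmp (name_b, literal, name_e - name_b) == 0          \
   && (literal)[name_e - name_b] == '\0')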
Example 8
/* Convert the Un*x-ish style directory listing stored in FILE to a
   linked list of fileinfo (system-independent) entries.  The contents
   of FILE are considered to be produced by the standard Unix `ls -la'
   output (whatever that might be).  BSD (no group) and SYSV (with
   group) listings are handled.

   The time stamps are stored in a separate variable, time_t
   compatible (I hope).  The timezones are ignored.  */
static struct fileinfo *
ftp_parse_unix_ls (const char *file)
{
  FILE *fp;
  static const char *months[] = {
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
  };
  int next, len, i, error, ignore;
  int year, month, day;		/* for time analysis */
  int hour, min, sec;
  struct tm timestruct, *tnow;
  time_t timenow;

  char *line, *tok;		/* tokenizer */
  struct fileinfo *dir, *l, cur; /* list creation */

  fp = fopen (file, "rb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  dir = l = NULL;

  /* Line loop to end of file: */
  while ((line = read_whole_line (fp)))
    {
      DEBUGP (("%s\n", line));
      len = strlen (line);
      /* Destroy <CR> if there is one.  */
      if (len && line[len - 1] == '\r')
	line[--len] = '\0';

      /* Skip if total...  */
      if (!strncasecmp (line, "total", 5))
	{
	  free (line);
	  continue;
	}
      /* Get the first token (permissions).  */
      tok = strtok (line, " ");
      if (!tok)
	{
	  free (line);
	  continue;
	}

      cur.name = NULL;
      cur.linkto = NULL;

      /* Decide whether we deal with a file or a directory.  */
      switch (*tok)
	{
	case '-':
	  cur.type = FT_PLAINFILE;
	  DEBUGP (("PLAINFILE; "));
	  break;
	case 'd':
	  cur.type = FT_DIRECTORY;
	  DEBUGP (("DIRECTORY; "));
	  break;
	case 'l':
	  cur.type = FT_SYMLINK;
	  DEBUGP (("SYMLINK; "));
	  break;
	default:
	  cur.type = FT_UNKNOWN;
	  DEBUGP (("UNKOWN; "));
	  break;
	}

      cur.perms = symperms (tok + 1);
      DEBUGP (("perms %0o; ", cur.perms));

      error = ignore = 0;       /* Erroneous and ignoring entries are
				   treated equally for now.  */
      year = hour = min = sec = 0; /* Silence the compiler.  */
      month = day = 0;
      next = -1;
      /* While there are tokens on the line, parse them.  Next is the
	 number of tokens left until the filename.

	 Use the month-name token as the "anchor" (the place where the
	 position wrt the file name is "known").  When a month name is
	 encountered, `next' is set to 5.  Also, the preceding
	 characters are parsed to get the file size.

	 This tactic is quite dubious when it comes to
	 internationalization issues (non-English month names), but it
	 works for now.  */
      while ((tok = strtok (NULL, " ")))
	{
	  --next;
	  if (next < 0)		/* a month name was not encountered */
	    {
	      for (i = 0; i < 12; i++)
		if (!strcmp (tok, months[i]))
		  break;
	      /* If we got a month, it means the token before it is the
		 size, and the filename is three tokens away.  */
	      if (i != 12)
		{
		  char *t = tok - 2;
		  long mul = 1;

		  for (cur.size = 0; t > line && ISDIGIT (*t); mul *= 10, t--)
		    cur.size += mul * (*t - '0');
		  if (t == line)
		    {
		      /* Something is seriously wrong.  */
		      error = 1;
		      break;
		    }
		  month = i;
		  next = 5;
		  DEBUGP (("month: %s; ", months[month]));
		}
	    }
	  else if (next == 4)	/* days */
	    {
	      if (tok[1])	/* two-digit... */
		day = 10 * (*tok - '0') + tok[1] - '0';
	      else		/* ...or one-digit */
		day = *tok - '0';
	      DEBUGP (("day: %d; ", day));
	    }
	  else if (next == 3)
	    {
	      /* This ought to be either the time, or the year.  Let's
		 be flexible!

		 If we have a number x, it's a year.  If we have x:y,
		 it's hours and minutes.  If we have x:y:z, z are
		 seconds.  */
	      year = 0;
	      min = hour = sec = 0;
	      /* We must deal with digits.  */
	      if (ISDIGIT (*tok))
		{
		  /* Suppose it's year.  */
		  for (; ISDIGIT (*tok); tok++)
		    year = (*tok - '0') + 10 * year;
		  if (*tok == ':')
		    {
		      /* This means these were hours!  */
		      hour = year;
		      year = 0;
		      ++tok;
		      /* Get the minutes...  */
		      for (; ISDIGIT (*tok); tok++)
			min = (*tok - '0') + 10 * min;
		      if (*tok == ':')
			{
			  /* ...and the seconds.  */
			  ++tok;
			  for (; ISDIGIT (*tok); tok++)
			    sec = (*tok - '0') + 10 * sec;
			}
		    }
		}
	      if (year)
		DEBUGP (("year: %d (no tm); ", year));
	      else
		DEBUGP (("time: %02d:%02d:%02d (no yr); ", hour, min, sec));
	    }
	  else if (next == 2)    /* The file name */
	    {
	      int fnlen;
	      char *p;

	      /* Since the file name may contain a SPC, it is possible
		 for strtok to handle it wrong.  */
	      fnlen = strlen (tok);
	      if (fnlen < len - (tok - line))
		{
		  /* So we have a SPC in the file name.  Restore the
		     original.  */
		  tok[fnlen] = ' ';
		  /* If the file is a symbolic link, it should have a
		     ` -> ' somewhere.  */
		  if (cur.type == FT_SYMLINK)
		    {
		      p = strstr (tok, " -> ");
		      if (!p)
			{
			  error = 1;
			  break;
			}
		      cur.linkto = xstrdup (p + 4);
		      DEBUGP (("link to: %s\n", cur.linkto));
		      /* And separate it from the file name.  */
		      *p = '\0';
		    }
		}
	      /* If we have the filename, add it to the list of files or
		 directories.  */
	      /* "." and ".." are an exception!  */
	      if (!strcmp (tok, ".") || !strcmp (tok, ".."))
		{
		  DEBUGP (("\nIgnoring `.' and `..'; "));
		  ignore = 1;
		  break;
		}
	      /* Some FTP sites choose to have ls -F as their default
		 LIST output, which marks the symlinks with a trailing
		 `@', directory names with a trailing `/' and
		 executables with a trailing `*'.  This is no problem
		 unless encountering a symbolic link ending with `@',
		 or an executable ending with `*' on a server without
		 default -F output.  I believe these cases are very
		 rare.  */
	      fnlen = strlen (tok); /* re-calculate `fnlen' */
	      cur.name = (char *)xmalloc (fnlen + 1);
	      memcpy (cur.name, tok, fnlen + 1);
	      if (fnlen)
		{
		  if (cur.type == FT_DIRECTORY && cur.name[fnlen - 1] == '/')
		    {
		      cur.name[fnlen - 1] = '\0';
		      DEBUGP (("trailing `/' on dir.\n"));
		    }
		  else if (cur.type == FT_SYMLINK && cur.name[fnlen - 1] == '@')
		    {
		      cur.name[fnlen - 1] = '\0';
		      DEBUGP (("trailing `@' on link.\n"));
		    }
		  else if (cur.type == FT_PLAINFILE
			   && (cur.perms & 0111)
			   && cur.name[fnlen - 1] == '*')
		    {
		      cur.name[fnlen - 1] = '\0';
		      DEBUGP (("trailing `*' on exec.\n"));
		    }
		} /* if (fnlen) */
	      else
		error = 1;
	      break;
	    }
	  else
	    abort ();
	} /* while */

      if (!cur.name || (cur.type == FT_SYMLINK && !cur.linkto))
	error = 1;

      DEBUGP (("\n"));

      if (error || ignore)
	{
	  DEBUGP (("Skipping.\n"));
	  FREE_MAYBE (cur.name);
	  FREE_MAYBE (cur.linkto);
	  free (line);
	  continue;
	}

      if (!dir)
	{
	  l = dir = (struct fileinfo *)xmalloc (sizeof (struct fileinfo));
	  memcpy (l, &cur, sizeof (cur));
	  l->prev = l->next = NULL;
	}
      else
	{
	  cur.prev = l;
	  l->next = (struct fileinfo *)xmalloc (sizeof (struct fileinfo));
	  l = l->next;
	  memcpy (l, &cur, sizeof (cur));
	  l->next = NULL;
	}
      /* Get the current time.  */
      timenow = time (NULL);
      tnow = localtime (&timenow);
      /* Build the time-stamp (the idea by [email protected]).  */
      timestruct.tm_sec   = sec;
      timestruct.tm_min   = min;
      timestruct.tm_hour  = hour;
      timestruct.tm_mday  = day;
      timestruct.tm_mon   = month;
      if (year == 0)
	{
	  /* Some listings will not specify the year if it is "obvious"
	     that the file was from the previous year.  E.g. if today
	     is 97-01-12, and you see a file of Dec 15th, its year is
	     1996, not 1997.  Thanks to Vladimir Volovich for
	     mentioning this!  */
	  if (month > tnow->tm_mon)
	    timestruct.tm_year = tnow->tm_year - 1;
	  else
	    timestruct.tm_year = tnow->tm_year;
	}
      else
	timestruct.tm_year = year;
      if (timestruct.tm_year >= 1900)
	timestruct.tm_year -= 1900;
      timestruct.tm_wday  = 0;
      timestruct.tm_yday  = 0;
      timestruct.tm_isdst = -1;
      l->tstamp = mktime (&timestruct); /* store the time-stamp */

      free (line);
    }

  fclose (fp);
  return dir;
}
Example 9
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  int up_error_code;
  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
		 url_error (up_error_code));
      return URLERROR;
    }

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth, html_allowed;
      boolean dash_p_leaf_HTML = FALSE;

      if (opt.quota && total_downloaded_bytes > opt.quota)
	break;
      if (status == FWRITEERR)
	break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue,
			(const char **)&url, (const char **)&referer,
			&depth, &html_allowed))
	break;

      /* ...and download it.  Note that this download is in most cases
	 unconditional, as download_child_p already makes sure a file
	 doesn't get enqueued twice -- and yet this check is here, and
	 not in download_child_p.  This is so that if you run `wget -r
	 URL1 URL2', and a random URL is encountered once under URL1
	 and again under URL2, but at a different (possibly smaller)
	 depth, we want the URL's children to be taken into account
	 the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
	{
	  file = xstrdup (hash_table_get (dl_url_file_map, url));

	  DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
		   url, file));

	  if (html_allowed
	      && downloaded_html_set
	      && string_set_contains (downloaded_html_set, file))
	    descend = 1;
	}
      else
	{
	  int dt = 0;
	  char *redirected = NULL;
	  int oldrec = opt.recursive;

	  opt.recursive = 0;
	  status = retrieve_url (url, &file, &redirected, referer, &dt);
	  opt.recursive = oldrec;

	  if (html_allowed && file && status == RETROK
	      && (dt & RETROKF) && (dt & TEXTHTML))
	    descend = 1;

	  if (redirected)
	    {
	      /* We have been redirected, possibly to another host, or
		 different path, or wherever.  Check whether we really
		 want to follow it.  */
	      if (descend)
		{
		  if (!descend_redirect_p (redirected, url, depth,
					   start_url_parsed, blacklist))
		    descend = 0;
		  else
		    /* Make sure that the old pre-redirect form gets
		       blacklisted. */
		    string_set_add (blacklist, url);
		}

	      xfree (url);
	      url = redirected;
	    }
	}

      if (descend
	  && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
	{
	  if (opt.page_requisites
	      && (depth == opt.reclevel || depth == opt.reclevel + 1))
	    {
	      /* When -p is specified, we are allowed to exceed the
		 maximum depth, but only for the "inline" links,
		 i.e. those that are needed to display the page.
		 Originally this could exceed the depth at most by
		 one, but we allow one more level so that the leaf
		 pages that contain frames can be loaded
		 correctly.  */
	      dash_p_leaf_HTML = TRUE;
	    }
	  else
	    {
	      /* Either -p wasn't specified or it was and we've
		 already spent the two extra (pseudo-)levels that it
		 affords us, so we need to bail out. */
	      DEBUGP (("Not descending further; at depth %d, max. %d.\n",
		       depth, opt.reclevel));
	      descend = 0;
	    }
	}

      /* If the downloaded document was HTML, parse it and enqueue the
	 links it contains. */

      if (descend)
	{
	  int meta_disallow_follow = 0;
	  struct urlpos *children
	    = get_urls_html (file, url, &meta_disallow_follow);

	  if (opt.use_robots && meta_disallow_follow)
	    {
	      free_urlpos (children);
	      children = NULL;
	    }

	  if (children)
	    {
	      struct urlpos *child = children;
	      struct url *url_parsed = url_parse (url, NULL);
	      assert (url_parsed != NULL);

	      for (; child; child = child->next)
		{
		  if (child->ignore_when_downloading)
		    continue;
		  if (dash_p_leaf_HTML && !child->link_inline_p)
		    continue;
		  if (download_child_p (child, url_parsed, depth, start_url_parsed,
					blacklist))
		    {
		      url_enqueue (queue, xstrdup (child->url->url),
				   xstrdup (url), depth + 1,
				   child->link_expect_html);
		      /* We blacklist the URL we have enqueued, because we
			 don't want to enqueue (and hence download) the
			 same URL twice.  */
		      string_set_add (blacklist, child->url->url);
		    }
		}

	      url_free (url_parsed);
	      free_urlpos (children);
	    }
	}

      if (opt.delete_after || (file && !acceptable (file)))
	{
	  /* Either --delete-after was specified, or we loaded this
	     otherwise rejected (e.g. by -R) HTML file just so we
	     could harvest its hyperlinks -- in either case, delete
	     the local file. */
	  DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		   opt.delete_after ? "--delete-after" :
		   "recursive rejection criteria"));
	  logprintf (LOG_VERBOSE,
		     (opt.delete_after
		      ? _("Removing %s.\n")
		      : _("Removing %s since it should be rejected.\n")),
		     file);
	  if (unlink (file))
	    logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	  register_delete_file (file);
	}

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3, d4;
    while (url_dequeue (queue,
			(const char **)&d1, (const char **)&d2, &d3, &d4))
      {
	xfree (d1);
	FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
Example 10
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
   address and the length of the string.  Return NULL if no URL is
   found.  */
const char *
htmlfindurl (const char *buf, int bufsize, int *size, int init)
{
  const char *p, *ph;
  state_t *s;
  /* NULL-terminated list of tags and modifiers someone would want to
     follow -- feel free to edit to suit your needs: */
  static struct tag_attr html_allow[] = {
    { "a", "href" },
    { "img", "src" },
    { "img", "href" },
    { "body", "background" },
    { "frame", "src" },
    { "iframe", "src" },
    { "fig", "src" },
    { "overlay", "src" },
    { "applet", "code" },
    { "script", "src" },
    { "embed", "src" },
    { "bgsound", "src" },
    { "area", "href" },
    { "img", "lowsrc" },
    { "input", "src" },
    { "layer", "src" },
    { "table", "background"},
    { "th", "background"},
    { "td", "background"},
    /* Tags below this line are treated specially.  */
    { "base", "href" },
    { "meta", "content" },
    { NULL, NULL }
  };

  s = &global_state;

  if (init)
    {
      DEBUGP (("Resetting a parser state.\n"));
      memset (s, 0, sizeof (*s));
    }

  while (1)
    {
      if (!bufsize)
	break;
      /* Let's look for a tag, if we are not already in one.  */
      if (!s->at_value)
	{
	  /* Find '<'.  */
	  if (*buf != '<')
	    for (; bufsize && *buf != '<'; ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  /* Skip spaces.  */
	  for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
	       ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  p = buf;
	  /* Find the tag end.  */
	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
	       ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  if (*buf == '=')
	    {
	      /* <tag=something> is illegal.  Just skip it.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  if (p == buf)
	    {
	      /* *buf == '>'.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  s->tag = strdupdelim (p, buf);
	  if (*buf == '>')
	    {
	      free (s->tag);
	      s->tag = NULL;
	      ++buf, --bufsize;
	      continue;
	    }
	}
      else                      /* s->at_value */
	{
	  /* Reset AT_VALUE.  */
	  s->at_value = 0;
	  /* If in quotes, just skip out of them and continue living.  */
	  if (s->in_quote)
	    {
	      s->in_quote = 0;
	      for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
	      if (!bufsize)
		break;
	      ++buf, --bufsize;
	    }
	  if (!bufsize)
	    break;
	  if (*buf == '>')
	    {
	      FREE_MAYBE (s->tag);
	      FREE_MAYBE (s->attr);
	      s->tag = s->attr = NULL;
	      continue;
	    }
	}
      /* Find the attributes.  */
      do
	{
	  FREE_MAYBE (s->attr);
	  s->attr = NULL;
	  if (!bufsize)
	    break;
	  /* Skip the spaces if we have them.  We don't have them at
	     places like <img alt="something"src="something-else">.
	                                     ^ no spaces here */
	  if (ISSPACE (*buf))
	    for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
		 ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  if (*buf == '=')
	    {
	      /* This is the case of <tag = something>, which is
		 illegal.  Just skip it.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  p = buf;
	  /* Find the attribute end.  */
	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
	       ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  /* Construct the attribute.  */
	  s->attr = strdupdelim (p, buf);
	  /* Now we must skip the spaces to find '='.  */
	  if (*buf != '=')
	    {
	      for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
	      if (!bufsize || *buf == '>')
		break;
	    }
	  /* If we still don't have '=', something is amiss.  */
	  if (*buf != '=')
	    continue;
	  /* Find the beginning of attribute value by skipping the
	     spaces.  */
	  ++buf, --bufsize;
	  for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  ph = NULL;
	  /* The value of an attribute can, but does not have to be
	     quoted.  */
	  if (*buf == '\"' || *buf == '\'')
	    {
	      s->in_quote = 1;
	      s->quote_char = *buf;
	      p = buf + 1;
	      for (++buf, --bufsize;
		   bufsize && *buf != s->quote_char && *buf != '\n';
		   ++buf, --bufsize)
		if (*buf == '#')
		  ph = buf;
	      if (!bufsize)
		{
		  s->in_quote = 0;
		  break;
		}
	      if (*buf == '\n')
		{
		  /* #### Is the following logic good?

		     Obviously no longer in quote.  It might be well
		     to check whether '>' was encountered, but that
		     would be encouraging writers of invalid HTMLs,
		     and we don't want that, now do we?  */
		  s->in_quote = 0;
		  continue;
		}
	    }
	  else
	    {
	      p = buf;
	      for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
		if (*buf == '#')
		  ph = buf;
	      if (!bufsize)
		break;
	    }
	  /* If '#' was found unprotected in a URI, it is probably an
	     HTML marker, or color spec.  */
	  *size = (ph ? ph : buf) - p;
	  /* The URI is liable to be returned if:
	     1) *size != 0;
	     2) its tag and attribute are found in html_allow.  */
	  if (*size && idmatch (html_allow, s->tag, s->attr))
	    {
	      if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
		{
		  FREE_MAYBE (s->base);
		  s->base = strdupdelim (p, buf);
		}
	      else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
		{
		  /* Some pages use a META tag to specify that the page
		     be refreshed by a new page after a given number of
		     seconds.  We need to attempt to extract an URL for
		     the new page from the other garbage present.  The
		     general format for this is:                  
		     <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

		     So we just need to skip past the "0; URL="
		     garbage to get to the URL.  META tags are also
		     used for specifying random things like the page
		     author's name and what editor was used to create
		     it.  So we need to be careful to ignore them and
		     not assume that an URL will be present at all.  */
		  for (; *size && ISDIGIT (*p); p++, *size -= 1);
		  if (*p == ';')
		    {
		      for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
		      if (!strncasecmp (p, "URL=", 4))
			{
			  p += 4, *size -= 4;
			  s->at_value = 1;
			  return p;
			}
		    }
		}
	      else
		{
		  s->at_value = 1;
		  return p;
		}
	    }
	  /* Exit from quote.  */
	  if (*buf == s->quote_char)
	    {
	      s->in_quote = 0;
	      ++buf, --bufsize;
	    }
	} while (*buf != '>');
      FREE_MAYBE (s->tag);
      FREE_MAYBE (s->attr);
      s->tag = s->attr = NULL;
      if (!bufsize)
	break;
    }

  FREE_MAYBE (s->tag);
  FREE_MAYBE (s->attr);
  FREE_MAYBE (s->base);
  memset (s, 0, sizeof (*s));	/* just to be sure */
  DEBUGP (("HTML parser ends here (state destroyed).\n"));
  return NULL;
}
Example 11
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
   address and the length of the string.  Return NULL if no URL is
   found.  */
const char *
htmlfindurl (const char *buf, int bufsize, int *size, int init,
             int dash_p_leaf_HTML)
{
    const char *p, *ph;
    state_t    *s = &global_state;

    /* NULL-terminated list of tags and modifiers someone would want to
       follow -- feel free to edit to suit your needs: */
    static struct tag_attr html_allow[] = {
        { "script", "src" },
        { "img", "src" },
        { "img", "href" },
        { "body", "background" },
        { "frame", "src" },
        { "iframe", "src" },
        { "fig", "src" },
        { "overlay", "src" },
        { "applet", "code" },
        { "script", "src" },
        { "embed", "src" },
        { "bgsound", "src" },
        { "img", "lowsrc" },
        { "input", "src" },
        { "layer", "src" },
        { "table", "background"},
        { "th", "background"},
        { "td", "background"},
        /* Tags below this line are treated specially.  */
        { "a", "href" },
        { "area", "href" },
        { "base", "href" },
        { "link", "href" },
        { "link", "rel" },
        { "meta", "content" },
        { NULL, NULL }
    };

    if (init)
    {
        DEBUGP (("Resetting a parser state.\n"));
        memset (s, 0, sizeof (*s));
    }

    while (1)
    {
        const char*  link_href = NULL;
        const char*  link_rel = NULL;
        int          link_href_saved_size = 0; /* init. just to shut up warning */

        if (!bufsize)
            break;
        /* Let's look for a tag, if we are not already in one.  */
        if (!s->at_value)
        {
            /* Find '<'.  */
            if (*buf != '<')
                for (; bufsize && *buf != '<'; ++buf, --bufsize);
            if (!bufsize)
                break;
            /* Skip spaces.  */
            for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                    ++buf, --bufsize);
            if (!bufsize)
                break;
            p = buf;
            /* Find the tag end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);
            if (!bufsize)
                break;
            if (*buf == '=')
            {
                /* <tag=something> is illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }
            if (p == buf)
            {
                /* *buf == '>'.  */
                ++buf, --bufsize;
                continue;
            }
            s->tag = strdupdelim (p, buf);
            if (*buf == '>')
            {
                free (s->tag);
                s->tag = NULL;
                ++buf, --bufsize;
                continue;
            }
        }
        else                      /* s->at_value */
        {
            /* Reset AT_VALUE.  */
            s->at_value = 0;
            /* If in quotes, just skip out of them and continue living.  */
            if (s->in_quote)
            {
                s->in_quote = 0;
                for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
                if (!bufsize)
                    break;
                ++buf, --bufsize;
            }
            if (!bufsize)
                break;
            if (*buf == '>')
            {
                FREE_MAYBE (s->tag);
                FREE_MAYBE (s->attr);
                s->tag = s->attr = NULL;
                continue;
            }
        }
        /* Find the attributes.  */
        do
        {
            FREE_MAYBE (s->attr);
            s->attr = NULL;
            if (!bufsize)
                break;
            /* Skip the spaces if we have them.  We don't have them at
               places like <img alt="something"src="something-else">.
                                               ^ no spaces here */
            if (ISSPACE (*buf))
                for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            if (*buf == '=')
            {
                /* This is the case of <tag = something>, which is
                illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }
            p = buf;
            /* Find the attribute end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            /* Construct the attribute.  */
            s->attr = strdupdelim (p, buf);
            /* Now we must skip the spaces to find '='.  */
            if (*buf != '=')
            {
                for (; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);
                if (!bufsize || *buf == '>')
                    break;
            }
            /* If we still don't have '=', something is amiss.  */
            if (*buf != '=')
                continue;
            /* Find the beginning of attribute value by skipping the
               spaces.  */
            ++buf, --bufsize;
            for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            ph = NULL;
            /* The value of an attribute can, but does not have to be
               quoted.  */
            if (*buf == '\"' || *buf == '\'')
            {
                s->in_quote = 1;
                s->quote_char = *buf;
                p = buf + 1;
                for (++buf, --bufsize;
                        bufsize && *buf != s->quote_char && *buf != '\n';
                        ++buf, --bufsize)
                    if (!ph && *buf == '#' && *(buf - 1) != '&')
                        ph = buf;
                if (!bufsize)
                {
                    s->in_quote = 0;
                    break;
                }
                if (*buf == '\n')
                {
                    /* #### Is the following logic good?

                       Obviously no longer in quote.  It might be well
                       to check whether '>' was encountered, but that
                       would be encouraging writers of invalid HTMLs,
                       and we don't want that, now do we?  */
                    s->in_quote = 0;
                    continue;
                }
            }
            else
            {
                p = buf;
                for (; bufsize && !ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize)
                    if (!ph && *buf == '#' && *(buf - 1) != '&')
                        ph = buf;
                if (!bufsize)
                    break;
            }
            /* If '#' was found unprotected in a URI, it is probably an
               HTML marker, or color spec.  */
            *size = (ph ? ph : buf) - p;
            /* The URI is liable to be returned if:
               1) *size != 0;
               2) its tag and attribute are found in html_allow.  */
            if (*size && idmatch (html_allow, s->tag, s->attr))
            {
                if (strcasecmp(s->tag, "a") == EQ ||
                        strcasecmp(s->tag, "area") == EQ)
                {
                    /* Only follow these if we're not at a -p leaf node, as they
                       always link to external documents. */
                    if (!dash_p_leaf_HTML)
                    {
                        s->at_value = 1;
                        return p;
                    }
                }
                else if (!strcasecmp (s->tag, "base") &&
                         !strcasecmp (s->attr, "href"))
                {
                    FREE_MAYBE (s->base);
                    s->base = strdupdelim (p, buf);
                }
                else if (strcasecmp(s->tag, "link") == EQ)
                {
                    if (strcasecmp(s->attr, "href") == EQ)
                    {
                        link_href = p;
                        link_href_saved_size = *size;  /* for restoration below */
                    }
                    else if (strcasecmp(s->attr, "rel") == EQ)
                        link_rel = p;

                    if (link_href != NULL && link_rel != NULL)
                        /* Okay, we've now seen this <LINK> tag's HREF and REL
                           attributes (they may be in either order), so it's now
                           possible to decide if we want to traverse it. */
                        if (!dash_p_leaf_HTML ||
                                strncasecmp(link_rel, "stylesheet",
                                            sizeof("stylesheet") - 1) == EQ)
                            /* In the normal case, all <LINK> tags are fair game.

                            In the special case of when -p is active, however, and
                             we're at a leaf node (relative to the -l max. depth) in
                             the HTML document tree, the only <LINK> tag we'll
                             follow is a <LINK REL="stylesheet">, as it's necessary
                             for displaying this document properly.  We won't follow
                             other <LINK> tags, like <LINK REL="home">, for
                             instance, as they refer to external documents.

                             Note that the above strncasecmp() will incorrectly
                             consider something like '<LINK REL="stylesheet.old"' as
                             equivalent to '<LINK REL="stylesheet"'.  Not really
                             worth the trouble to explicitly check for such cases --
                             if time is spent, it should be spent ripping out wget's
                             somewhat kludgy HTML parser and hooking in a real,
                             componentized one. */
                        {
                            /* When we return, the 'size' IN/OUT parameter
                               determines where in the buffer the end of the current
                               attribute value is.  If REL came after HREF in this
                               <LINK> tag, size is currently set to the size for
                               REL's value -- set it to what it was when we were
                               looking at HREF's value. */
                            *size = link_href_saved_size;

                            s->at_value = 1;
                            return link_href;
                        }
                }
                else if (!strcasecmp (s->tag, "meta") &&
                         !strcasecmp (s->attr, "content"))
                {
                    /* Some pages use a META tag to specify that the page
                       be refreshed by a new page after a given number of
                       seconds.  We need to attempt to extract an URL for
                       the new page from the other garbage present.  The
                       general format for this is:
                       <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

                       So we just need to skip past the "0; URL="
                       garbage to get to the URL.  META tags are also
                       used for specifying random things like the page
                       author's name and what editor was used to create
                       it.  So we need to be careful to ignore them and
                       not assume that an URL will be present at all.  */
                    for (; *size && ISDIGIT (*p); p++, *size -= 1);
                    if (*p == ';')
                    {
                        for (p++, *size -= 1;
                                *size && ISSPACE (*p);
                                p++, *size -= 1) ;
                        if (!strncasecmp (p, "URL=", 4))
                        {
                            p += 4, *size -= 4;
                            s->at_value = 1;
                            return p;
                        }
                    }
                }
                else
                {
                    s->at_value = 1;
                    return p;
                }
            }
            /* Exit from quote.  */
            if (*buf == s->quote_char)
            {
                s->in_quote = 0;
                ++buf, --bufsize;
            }
        } while (*buf != '>');
        FREE_MAYBE (s->tag);
        FREE_MAYBE (s->attr);
        s->tag = s->attr = NULL;
        if (!bufsize)
            break;
    }

    FREE_MAYBE (s->tag);
    FREE_MAYBE (s->attr);
    FREE_MAYBE (s->base);
    memset (s, 0, sizeof (*s));	/* just to be sure */
    DEBUGP (("HTML parser ends here (state destroyed).\n"));
    return NULL;
}
Example 12
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
	undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  string_set_add (undesirable_urls, u->url);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user. */
    {
      if (opt.page_requisites && depth <= opt.reclevel + 1)
	/* When -p is specified, we can do one more partial recursion from the
	   "leaf nodes" on the HTML document tree.  The recursion is partial in
	   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
	   except for <LINK REL="stylesheet">. */
	dash_p_leaf_HTML = TRUE;
      else
	/* Either -p wasn't specified or it was and we've already gone the one
	   extra (pseudo-)level that it affords us, so we need to bail out. */
	{
	  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
		   depth, opt.reclevel));
	  --depth;
	  return RECLEVELEXC;
	}
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
			    dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded regardless of
	 suffix rules (but that is remedied later with unlink) unless
	 the depth equals the maximum depth.

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 undesirable_urls.  Using it is crucial to avoid unnecessary
	 repeated continuous hits to the hash table.  */
      inl = string_set_contains (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !cur_url->link_relative_p)
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
            subject to acc/rej rules, unless a finite maximum depth has
            been specified and the current depth is the maximum depth. */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                      && ((opt.reclevel != INFINITE_RECURSION) &&
			  (depth != opt.reclevel))))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = TOLOWER (*p);
	      xfree (u->url);
	      u->url = str_url (u, 0);
	    }
	  xfree (constr);
	  constr = xstrdup (u->url);
	  string_set_add (undesirable_urls, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		string_set_add (undesirable_urls, constr);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  xfree (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  string_set_add (undesirable_urls, constr);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      xfree (constr);
	      constr = newloc;
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));

	  if (opt.delete_after || (filename && !acceptable (filename)))
	    /* Either --delete-after was specified, or we loaded this otherwise
	       rejected (e.g. by -R) HTML file just so we could harvest its
	       hyperlinks -- in either case, delete the local file. */
	    {
	      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		       opt.delete_after ? "--delete-after" :
		       "recursive rejection criteria"));
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }

	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}
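The comment at the top of this example describes the heart of the algorithm: every URL that has been seen goes into a set, and that set is consulted before anything is retrieved, so no URL can be downloaded twice. Below is a minimal, self-contained sketch of that guard using a hypothetical linked-list set named url_set; the real function above uses Wget's string hash table (string_set_add / string_set_contains) for the same purpose.

/* Minimal sketch of the "check the visited set before loading" idea.
   url_set, url_seen and url_mark are hypothetical; Wget uses a string
   hash table here instead.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct url_set { char *url; struct url_set *next; };

/* Return non-zero if URL is already in SET.  */
static int
url_seen (const struct url_set *set, const char *url)
{
  for (; set; set = set->next)
    if (!strcmp (set->url, url))
      return 1;
  return 0;
}

/* Prepend URL to SET and return the new head.  */
static struct url_set *
url_mark (struct url_set *set, const char *url)
{
  struct url_set *node = malloc (sizeof *node);
  node->url = strdup (url);
  node->next = set;
  return node;
}

int
main (void)
{
  struct url_set *seen = NULL;
  const char *links[] = { "http://a/", "http://b/", "http://a/" };
  int i;

  for (i = 0; i < 3; i++)
    {
      if (url_seen (seen, links[i]))
        {
          printf ("skipping %s (already loaded)\n", links[i]);
          continue;
        }
      seen = url_mark (seen, links[i]);
      printf ("loading %s\n", links[i]);
    }
  return 0;
}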
Example no. 13
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URL-s stored to a linked list of URL-s, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      ulist = add_slist (ulist, this_url, 0);
      urls_downloaded = NULL;
      urls_html = NULL;
      /* Enter this_url to the slist, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  ulist = add_slist (ulist, u->url, 0);
	  urls_downloaded = add_url (urls_downloaded, u->url, file);
	  urls_html = add_slist (urls_html, file, NOSORT);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
    ++depth;

  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
    {
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
	       depth, opt.reclevel));
      --depth;
      return RECLEVELEXC;
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file,
			    canon_this_url ? canon_this_url : this_url, 0);

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      if (u->proto == URLFILE)
	{
	  DEBUGP (("Nothing to do with file:// around here.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded
	 regardless of suffix rules (but that is remedied later with
	 unlink).

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 ulist.  Using it is crucial to avoid the incessant calls to
	 in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->proto != URLFTP)
	if (opt.relative_only && !(cur_url->flags & URELATIVE))
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    ulist = add_slist (ulist, constr, 0);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->proto == URLFTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
	     subject to acc/rej rules.  That's why the `!'.  */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
		   && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  ulist = add_slist (ulist, constr, 0);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = tolower (*p);
	      free (u->url);
	      u->url = str_url (u, 0);
	    }
	  free (constr);
	  constr = xstrdup (u->url);
	  inl = in_slist (ulist, constr);
	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		ulist = add_slist (ulist, constr, 0);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
	{
	  /* Since Wget knows about only one set of robot rules at a
	     time, /robots.txt must be reloaded whenever a new host is
	     accessed.

	     robots_host holds the host the current `forbid' variable
	     is assigned to.  */
	  if (!robots_host || !same_host (robots_host, u->host))
	    {
	      FREE_MAYBE (robots_host);
	      /* Now make robots_host the new host, no matter what the
		 result will be.  So if there is no /robots.txt on the
		 site, Wget will not retry getting robots all the
		 time.  */
	      robots_host = xstrdup (u->host);
	      free_vec (forbidden);
	      forbidden = NULL;
	      err = retrieve_robots (constr, ROBOTS_FILENAME);
	      if (err == ROBOTSOK)
		{
		  rurl = robots_url (constr, ROBOTS_FILENAME);
		  rfile = url_filename (rurl);
		  forbidden = parse_robots (rfile);
		  freeurl (rurl, 1);
		  free (rfile);
		}
	    }

	  /* Now that we have (or don't have) robots, we can check for
	     them.  */
	  if (!robots_match (u, forbidden))
	    {
	      DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
		       ROBOTS_FILENAME));
	      ulist = add_slist (ulist, constr, 0);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  ulist = add_slist (ulist, constr, 0);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->proto == URLFTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->proto == URLFTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      free (constr);
	      constr = newloc;
	    }
	  /* In case of convert_links: If there was no error, add it to
	     the list of downloaded URLs.  We might need it for
	     conversion.  */
	  if (opt.convert_links && filename)
	    {
	      if (dt & RETROKF)
		{
		  urls_downloaded = add_url (urls_downloaded, constr, filename);
		  /* If the URL is HTML, note it.  */
		  if (dt & TEXTHTML)
		    urls_html = add_slist (urls_html, filename, NOSORT);
		}
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));
	  /* If a suffix-rejected file was loaded only because it was HTML,
	     undo the error now.  */
	  if (opt.delete_after || (filename && !acceptable (filename)))
	    {
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }
	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->flags |= UABS2REL;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
    return QUOTEXC;
  else
    return RETROK;
}
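Both recursive_retrieve variants implement --no-parent the same way: the directory of the starting URL is remembered in base_dir, and a candidate URL survives only if its directory string begins with that base (the frontcmp calls above). Here is a stand-alone sketch of that prefix test, using a hypothetical helper dir_within in place of Wget's frontcmp:

/* Sketch of the --no-parent directory test.  dir_within is a
   hypothetical stand-in for Wget's frontcmp.  */
#include <stdio.h>
#include <string.h>

/* Return non-zero if DIR equals BASE or lies below it.  */
static int
dir_within (const char *base, const char *dir)
{
  return strncmp (base, dir, strlen (base)) == 0;
}

int
main (void)
{
  const char *base_dir = "/docs/manual";

  printf ("%d\n", dir_within (base_dir, "/docs/manual/ch1")); /* 1: descend */
  printf ("%d\n", dir_within (base_dir, "/docs"));            /* 0: parent, skip */
  return 0;
}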
Example no. 14
File: retr.c Project: aosm/wget
/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   or simply copy it with file:// (#### the latter not yet
   implemented!).  */
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
	      const char *refurl, int *dt)
{
  uerr_t result;
  char *url;
  int location_changed, already_redirected, dummy;
  int local_use_proxy;
  char *mynewloc, *proxy;
  struct urlinfo *u;


  /* If dt is NULL, just ignore it.  */
  if (!dt)
    dt = &dummy;
  url = xstrdup (origurl);
  if (newloc)
    *newloc = NULL;
  if (file)
    *file = NULL;
  already_redirected = 0;

 again:
  u = newurl ();
  /* Parse the URL.  RFC2068 requires `Location' to contain an
     absoluteURI, but many sites break this requirement.  #### We
     should be liberal and accept a relative location, too.  */
  result = parseurl (url, u, already_redirected);
  if (result != URLOK)
    {
      freeurl (u, 1);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
      return result;
    }

  /* Set the referer.  */
  if (refurl)
    u->referer = xstrdup (refurl);
  else
    u->referer = NULL;

  local_use_proxy = USE_PROXY_P (u);
  if (local_use_proxy)
    {
      struct urlinfo *pu = newurl ();

      /* Copy the original URL to new location.  */
      memcpy (pu, u, sizeof (*u));
      pu->proxy = NULL; /* A minor correction :) */
      /* Initialize u to nil.  */
      memset (u, 0, sizeof (*u));
      u->proxy = pu;
      /* Get the appropriate proxy server, appropriate for the
	 current protocol.  */
      proxy = getproxy (pu->proto);
      if (!proxy)
	{
	  logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
	  freeurl (u, 1);
	  return PROXERR;
	}
      /* Parse the proxy URL.  */
      result = parseurl (proxy, u, 0);
      if (result != URLOK || u->proto != URLHTTP)
	{
	  if (u->proto == URLHTTP)
	    logprintf (LOG_NOTQUIET, "Proxy %s: %s.\n", proxy, uerrmsg (result));
	  else
	    logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
	  freeurl (u, 1);
	  return PROXERR;
	}
      u->proto = URLHTTP;
    }

  assert (u->proto != URLFILE);	/* #### Implement me!  */
  mynewloc = NULL;

  if (u->proto == URLHTTP)
    result = http_loop (u, &mynewloc, dt);
  else if (u->proto == URLFTP)
    {
      /* If this is a redirection, we must not allow recursive FTP
	 retrieval, so we save recursion to oldrec, and restore it
	 later.  */
      int oldrec = opt.recursive;
      if (already_redirected)
	opt.recursive = 0;
      result = ftp_loop (u, dt);
      opt.recursive = oldrec;
      /* There is a possibility of having HTTP being redirected to
	 FTP.  In these cases we must decide whether the text is HTML
	 according to the suffix.  The HTML suffixes are `.html' and
	 `.htm', case-insensitive.

	 #### All of this is, of course, crap.  These types should be
	 determined through mailcap.  */
      if (already_redirected && u->local && (u->proto == URLFTP ))
	{
	  char *suf = suffix (u->local);
	  if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
	    *dt |= TEXTHTML;
	  FREE_MAYBE (suf);
	}
    }
  location_changed = (result == NEWLOCATION);
  if (location_changed)
    {
      /* Check for redirection to oneself.  */
      if (url_equal (url, mynewloc))
	{
	  logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
		     mynewloc);
	  return WRONGCODE;
	}
      if (mynewloc)
	{
	  free (url);
	  url = mynewloc;
	}
      freeurl (u, 1);
      already_redirected = 1;
      goto again;
    }
  if (file)
    {
      if (u->local)
	*file = xstrdup (u->local);
      else
	*file = NULL;
    }
  freeurl (u, 1);

  if (newloc)
    *newloc = url;
  else
    free (url);

  return result;
}
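When an HTTP request is redirected to FTP (the block commented above), retrieve_url falls back to guessing the content type from the file suffix: .html or .htm, compared case-insensitively, marks the result as text/html. The following is a self-contained sketch of that test, with file_suffix as an assumed stand-in for Wget's suffix():

/* Sketch of the case-insensitive ".html"/".htm" check used after an
   HTTP-to-FTP redirection.  file_suffix is an assumed helper, not
   Wget's suffix().  */
#include <stdio.h>
#include <string.h>
#include <strings.h>            /* strcasecmp */

/* Return the extension after the last '.', or NULL if there is none.  */
static const char *
file_suffix (const char *name)
{
  const char *dot = strrchr (name, '.');
  return (dot && dot[1]) ? dot + 1 : NULL;
}

int
main (void)
{
  const char *files[] = { "index.HTML", "paper.htm", "archive.tar.gz" };
  int i;

  for (i = 0; i < 3; i++)
    {
      const char *suf = file_suffix (files[i]);
      int is_html = suf
        && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm"));
      printf ("%s -> %s\n", files[i],
              is_html ? "treated as text/html" : "not HTML");
    }
  return 0;
}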