Esempio n. 1
0
/*
 * Find the PolyEdge record for the vertex pair (v0, v1) in PLEX and add
 * the value returned by idmatch() to that record's coefficient.
 *
 * The lookup key is { v0, v1, 0, 0 }.  idmatch() is called with the key
 * first and the record's stored id numbers second; its nonzero result is
 * used directly as the increment (presumably a +1/-1 orientation sign --
 * confirm against idmatch).
 *
 * Returns 1 when the coefficient was updated, 0 when no record was found
 * or idmatch() reported 0.
 */
int add_edge (struct Plex *plex, long v0, long v1)
{
	long key[4], stored[4];
	struct Glass *edge;
	int orientation, k;

	key[0] = v0;
	key[1] = v1;
	key[2] = 0;
	key[3] = 0;

	edge = lookupGlass (plex, PolyEdge, key);
	if (edge == NULL)
		return (0);

	/* Work on a private copy of the record's id numbers. */
	k = 0;
	while (k < 4) {
		stored[k] = edge -> idnumbers[k];
		k++;
	}

	orientation = idmatch (PolyEdge, key, stored);
	if (orientation != 0) {
		edge -> coefficient += orientation;
		return (1);
	}
	return (0);
}
Esempio n. 2
0
/*
 * Search PLEX for a Glass record of the given TYPE whose id numbers match
 * IDNUMBERS according to idmatch().
 *
 * The bucket is selected as plexis[type * maxhash + hashGlass(idnumbers,
 * maxhash)]; its chain is then walked through the `next` links.  Each
 * candidate's id numbers are copied into a local array before being
 * handed to idmatch() (candidate first, requested ids second).
 *
 * Returns the first record for which idmatch() is nonzero, or NULL when
 * the chain holds no such record.
 */
struct Glass *lookupGlass (struct Plex *plex, enum PlexType type, long idnumbers[4])
{
	long nbuckets, slot;
	long candidate[4];
	struct Plexi *bucket;
	struct Glass *node;
	int k, matched;

	nbuckets = plex -> maxhash;
	slot = hashGlass (idnumbers, nbuckets);
	bucket = plex -> plexis + ((long) type) * nbuckets + slot;

	node = bucket -> head;
	while (node != NULL) {
		for (k = 0; k < 4; k++)
			candidate[k] = node -> idnumbers[k];
		matched = idmatch (type, candidate, idnumbers);
		if (matched != 0)
			return (node);
		node = node -> next;
	}
	return (NULL);
}
Esempio n. 3
0
/*History:
2015-01-29 17:28 初步看了下,没有例子,不知道实际解析时的情况,比较糊涂。最好是自己写例子测试一下的。另外看起来得了解一点html语法。

*/
/* Scan BUF (a buffer of BUFSIZE characters) for the next attribute value
   whose tag/attribute pair is accepted by idmatch() against html_allow[].

   On success a pointer into BUF at the start of the value is returned
   and *SIZE is set to its length (the value is cut at a '#' when one was
   seen inside it).  The parser state lives in global_state; before
   returning a value, at_value is set so that the next call resumes
   inside the same tag instead of searching for a new '<'.
   NOTE(review): this presumes the caller passes a BUF that starts at or
   after the returned value on the next call -- confirm against callers.

   A non-zero INIT zeroes the parser state before scanning.

   <base href=...> values are copied into the state (s->base) and not
   returned; <meta content=...> values are returned only when they have
   the form "<digits>;<spaces>URL=<url>", in which case the pointer and
   *SIZE describe the part after "URL=".

   Returns NULL when the buffer is exhausted; the state's tag, attr and
   base strings are released and the state is zeroed at that point.  */
const char *
htmlfindurl(const char *buf, int bufsize, int *size, int init)
{
    const char *p, *ph;
    state_t *s;

    /* NULL-terminated list of tags and modifiers someone would want to
       follow -- feel free to edit to suit your needs: */
    /* Declared static so that only one copy of the table exists.  */
    static struct tag_attr html_allow[] =
    {
        /* tag : attribute pairs */
        { "a", "href" },
        { "img", "src" },
        { "img", "href" },
        { "body", "background" },
        { "frame", "src" },
        { "iframe", "src" },
        { "fig", "src" },
        { "overlay", "src" },
        { "applet", "code" },
        { "script", "src" },
        { "embed", "src" },
        { "bgsound", "src" },
        { "area", "href" },
        { "img", "lowsrc" },
        { "input", "src" },
        { "layer", "src" },
        { "table", "background"},
        { "th", "background"},
        { "td", "background"},
        /* Tags below this line are treated specially.  */
        { "base", "href" },
        { "meta", "content" },
        { NULL, NULL }          /* a NULL pair terminates the list */
    };

    s = &global_state;

    if (init)
    {
        DEBUGP (("Resetting a parser state.\n"));
        memset (s, 0, sizeof (*s));
    }

    while (1)
    {
        /* Nothing left to scan: leave the loop.  */
        if (!bufsize)
            break;

        /* Let's look for a tag, if we are not already in one.  */
        if (!s->at_value)
        {
            /* Find '<'.  */
            if (*buf != '<')
                for (; bufsize && *buf != '<'; ++buf, --bufsize);

            /* bufsize == 0 means the end of the buffer was reached.  */
            if (!bufsize)
                break;

            /* Skip spaces.  */
            for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                    ++buf, --bufsize);

            if (!bufsize)
                break;

            p = buf;

            /* Find the tag end: stop at whitespace, '>', '=' or the end
               of the buffer.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);

            if (!bufsize)
                break;

            if (*buf == '=')
            {
                /* <tag=something> is illegal.  Just skip it.  */
                ++buf, --bufsize;

                continue;
            }

            if (p == buf)
            {
                /* *buf == '>'.  */
                ++buf, --bufsize;

                continue;
            }

            s->tag = strdupdelim (p, buf);

            if (*buf == '>')
            {
                /* A tag without attributes carries nothing of interest.  */
                free (s->tag);
                s->tag = NULL;
                ++buf, --bufsize;
                continue;
            }
        }
        else                      /* s->at_value */
        {
            /* The previous call returned an attribute value; resume from
               there.  */
            /* Reset AT_VALUE.  */
            s->at_value = 0;
            /* If in quotes, just skip out of them and continue living.  */
            if (s->in_quote)
            {
                s->in_quote = 0;
                for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);

                if (!bufsize)
                    break;
                ++buf, --bufsize;
            }

            if (!bufsize)
                break;

            if (*buf == '>')
            {
                FREE_MAYBE (s->tag);
                FREE_MAYBE (s->attr);
                s->tag = s->attr = NULL;
                continue;
            }
        }

        /* Find the attributes.  */
        do
        {
            FREE_MAYBE (s->attr);
            s->attr = NULL;

            if (!bufsize)
                break;
            /* Skip the spaces if we have them.  We don't have them at
               places like <img alt="something"src="something-else">.
                                               ^ no spaces here */

            if (ISSPACE (*buf))
                for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);

            if (!bufsize || *buf == '>')
                break;

            if (*buf == '=')
            {
                /* This is the case of <tag = something>, which is
                   illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }

            p = buf;
            /* Find the attribute end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);

            if (!bufsize || *buf == '>')
                break;

            /* Construct the attribute from the characters in [p, buf).  */
            s->attr = strdupdelim (p, buf);
            /* Now we must skip the spaces to find '='.  */
            if (*buf != '=')
            {
                for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
                if (!bufsize || *buf == '>')
                    break;
            }

            /* If we still don't have '=', something is amiss.  */
            if (*buf != '=')
                continue;

            /* Find the beginning of attribute value by skipping the
               spaces.  */
            ++buf, --bufsize;
            for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
            /* Stop if the buffer ended or the tag was closed.  */
            if (!bufsize || *buf == '>')
                break;
            ph = NULL;
            /* The value of an attribute can, but does not have to be
               quoted.  */
            if (*buf == '\"' || *buf == '\'')
            {
                /* Enter the quoted state and remember which quote
                   character has to close it.  */
                s->in_quote = 1;
                s->quote_char = *buf;
                /* p points at the first character inside the quotes.  */
                p = buf + 1;
                /* Advance to the matching quote or to a newline,
                   recording the position of the most recent '#'.  */
                for (++buf, --bufsize;
                        bufsize && *buf != s->quote_char && *buf != '\n';
                        ++buf, --bufsize)
                    if (*buf == '#')
                        ph = buf;
                if (!bufsize)
                {
                    /* Buffer ended inside the quotes: drop the quoted
                       state.  */
                    s->in_quote = 0;
                    break;
                }
                if (*buf == '\n')
                {
                    /* #### Is the following logic good?

                       Obviously no longer in quote.  It might be well
                       to check whether '>' was encountered, but that
                       would be encouraging writers of invalid HTMLs,
                       and we don't want that, now do we?  */
                    s->in_quote = 0;
                    continue;
                }
            }
            else
            {
                p = buf;

                for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
                    if (*buf == '#')
                        ph = buf;
                if (!bufsize)
                    break;
            }

            /* If '#' was found unprotected in a URI, it is probably an
               HTML marker, or color spec.  The reported value therefore
               ends at the recorded '#' when there is one.  */
            *size = (ph ? ph : buf) - p;
            /* The URI is liable to be returned if:
               1) *size != 0;
               2) its tag and attribute are found in html_allow.
               Example: <a href="http://www.w3school.com.cn/">Visit W3School</a>  */
            if (*size && idmatch (html_allow, s->tag, s->attr))
            {
                if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
                {
                    FREE_MAYBE (s->base);
                    s->base = strdupdelim (p, buf);
                }
                else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
                {
                    /* Some pages use a META tag to specify that the page
                       be refreshed by a new page after a given number of
                       seconds.  We need to attempt to extract an URL for
                       the new page from the other garbage present.  The
                       general format for this is:
                       <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

                       So we just need to skip past the "0; URL="
                       garbage to get to the URL.  META tags are also
                       used for specifying random things like the page
                       author's name and what editor was used to create
                       it.  So we need to be careful to ignore them and
                       not assume that an URL will be present at all.  */
                    /* Skip the leading digits.  */
                    for (; *size && ISDIGIT (*p); p++, *size -= 1);

                    if (*p == ';')
                    {
                        /* Skip the ';' and any spaces after it.  */
                        for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
                        if (!strncasecmp (p, "URL=", 4))
                        {
                            /* Report only what follows "URL=".  */
                            p += 4, *size -= 4;
                            s->at_value = 1;
                            return p;
                        }
                    }
                }
                else
                {
                    s->at_value = 1;
                    return p;
                }
            }

            /* Exit from quote.  */
            if (*buf == s->quote_char)
            {
                s->in_quote = 0;
                ++buf, --bufsize;
            }
        }
        /* Test bufsize first: the '=' skip above and the quote exit can
           both consume the last character, and *buf would then lie
           outside the caller's buffer.  With bufsize == 0 both outcomes
           of the old test ended the scan anyway.  */
        while (bufsize && *buf != '>');

        FREE_MAYBE (s->tag);
        FREE_MAYBE (s->attr);
        s->tag = s->attr = NULL;

        if (!bufsize)
            break;
    }

    FREE_MAYBE (s->tag);
    FREE_MAYBE (s->attr);
    FREE_MAYBE (s->base);

    memset (s, 0, sizeof (*s));	/* just to be sure */
    DEBUGP (("HTML parser ends here (state destroyed).\n"));

    return NULL;
}
Esempio n. 4
0
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
   address and the length of the string.  Return NULL if no URL is
   found.  */
const char *
htmlfindurl (const char *buf, int bufsize, int *size, int init)
{
  const char *p, *ph;
  state_t *s;
  /* NULL-terminated list of tags and modifiers someone would want to
     follow -- feel free to edit to suit your needs: */
  static struct tag_attr html_allow[] = {
    { "a", "href" },
    { "img", "src" },
    { "img", "href" },
    { "body", "background" },
    { "frame", "src" },
    { "iframe", "src" },
    { "fig", "src" },
    { "overlay", "src" },
    { "applet", "code" },
    { "script", "src" },
    { "embed", "src" },
    { "bgsound", "src" },
    { "area", "href" },
    { "img", "lowsrc" },
    { "input", "src" },
    { "layer", "src" },
    { "table", "background"},
    { "th", "background"},
    { "td", "background"},
    /* Tags below this line are treated specially.  */
    { "base", "href" },
    { "meta", "content" },
    { NULL, NULL }
  };

  s = &global_state;

  if (init)
    {
      DEBUGP (("Resetting a parser state.\n"));
      memset (s, 0, sizeof (*s));
    }

  while (1)
    {
      if (!bufsize)
	break;
      /* Let's look for a tag, if we are not already in one.  */
      if (!s->at_value)
	{
	  /* Find '<'.  */
	  if (*buf != '<')
	    for (; bufsize && *buf != '<'; ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  /* Skip spaces.  */
	  for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
	       ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  p = buf;
	  /* Find the tag end.  */
	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
	       ++buf, --bufsize);
	  if (!bufsize)
	    break;
	  if (*buf == '=')
	    {
	      /* <tag=something> is illegal.  Just skip it.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  if (p == buf)
	    {
	      /* *buf == '>'.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  s->tag = strdupdelim (p, buf);
	  if (*buf == '>')
	    {
	      free (s->tag);
	      s->tag = NULL;
	      ++buf, --bufsize;
	      continue;
	    }
	}
      else                      /* s->at_value */
	{
	  /* Reset AT_VALUE.  */
	  s->at_value = 0;
	  /* If in quotes, just skip out of them and continue living.  */
	  if (s->in_quote)
	    {
	      s->in_quote = 0;
	      for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
	      if (!bufsize)
		break;
	      ++buf, --bufsize;
	    }
	  if (!bufsize)
	    break;
	  if (*buf == '>')
	    {
	      FREE_MAYBE (s->tag);
	      FREE_MAYBE (s->attr);
	      s->tag = s->attr = NULL;
	      continue;
	    }
	}
      /* Find the attributes.  */
      do
	{
	  FREE_MAYBE (s->attr);
	  s->attr = NULL;
	  if (!bufsize)
	    break;
	  /* Skip the spaces if we have them.  We don't have them at
	     places like <img alt="something"src="something-else">.
	                                     ^ no spaces here */
	  if (ISSPACE (*buf))
	    for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
		 ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  if (*buf == '=')
	    {
	      /* This is the case of <tag = something>, which is
		 illegal.  Just skip it.  */
	      ++buf, --bufsize;
	      continue;
	    }
	  p = buf;
	  /* Find the attribute end.  */
	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
	       ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  /* Construct the attribute.  */
	  s->attr = strdupdelim (p, buf);
	  /* Now we must skip the spaces to find '='.  */
	  if (*buf != '=')
	    {
	      for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
	      if (!bufsize || *buf == '>')
		break;
	    }
	  /* If we still don't have '=', something is amiss.  */
	  if (*buf != '=')
	    continue;
	  /* Find the beginning of attribute value by skipping the
	     spaces.  */
	  ++buf, --bufsize;
	  for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
	  if (!bufsize || *buf == '>')
	    break;
	  ph = NULL;
	  /* The value of an attribute can, but does not have to be
	     quoted.  */
	  if (*buf == '\"' || *buf == '\'')
	    {
	      s->in_quote = 1;
	      s->quote_char = *buf;
	      p = buf + 1;
	      for (++buf, --bufsize;
		   bufsize && *buf != s->quote_char && *buf != '\n';
		   ++buf, --bufsize)
		if (*buf == '#')
		  ph = buf;
	      if (!bufsize)
		{
		  s->in_quote = 0;
		  break;
		}
	      if (*buf == '\n')
		{
		  /* #### Is the following logic good?

		     Obviously no longer in quote.  It might be well
		     to check whether '>' was encountered, but that
		     would be encouraging writers of invalid HTMLs,
		     and we don't want that, now do we?  */
		  s->in_quote = 0;
		  continue;
		}
	    }
	  else
	    {
	      p = buf;
	      for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
		if (*buf == '#')
		  ph = buf;
	      if (!bufsize)
		break;
	    }
	  /* If '#' was found unprotected in a URI, it is probably an
	     HTML marker, or color spec.  */
	  *size = (ph ? ph : buf) - p;
	  /* The URI is liable to be returned if:
	     1) *size != 0;
	     2) its tag and attribute are found in html_allow.  */
	  if (*size && idmatch (html_allow, s->tag, s->attr))
	    {
	      if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
		{
		  FREE_MAYBE (s->base);
		  s->base = strdupdelim (p, buf);
		}
	      else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
		{
		  /* Some pages use a META tag to specify that the page
		     be refreshed by a new page after a given number of
		     seconds.  We need to attempt to extract an URL for
		     the new page from the other garbage present.  The
		     general format for this is:                  
		     <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

		     So we just need to skip past the "0; URL="
		     garbage to get to the URL.  META tags are also
		     used for specifying random things like the page
		     author's name and what editor was used to create
		     it.  So we need to be careful to ignore them and
		     not assume that an URL will be present at all.  */
		  for (; *size && ISDIGIT (*p); p++, *size -= 1);
		  if (*p == ';')
		    {
		      for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
		      if (!strncasecmp (p, "URL=", 4))
			{
			  p += 4, *size -= 4;
			  s->at_value = 1;
			  return p;
			}
		    }
		}
	      else
		{
		  s->at_value = 1;
		  return p;
		}
	    }
	  /* Exit from quote.  */
	  if (*buf == s->quote_char)
	    {
	      s->in_quote = 0;
	      ++buf, --bufsize;
	    }
	} while (*buf != '>');
      FREE_MAYBE (s->tag);
      FREE_MAYBE (s->attr);
      s->tag = s->attr = NULL;
      if (!bufsize)
	break;
    }

  FREE_MAYBE (s->tag);
  FREE_MAYBE (s->attr);
  FREE_MAYBE (s->base);
  memset (s, 0, sizeof (*s));	/* just to be sure */
  DEBUGP (("HTML parser ends here (state destroyed).\n"));
  return NULL;
}
Esempio n. 5
0
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
   address and the length of the string.  Return NULL if no URL is
   found.  */
const char *
htmlfindurl (const char *buf, int bufsize, int *size, int init,
             int dash_p_leaf_HTML)
{
    const char *p, *ph;
    state_t    *s = &global_state;

    /* NULL-terminated list of tags and modifiers someone would want to
       follow -- feel free to edit to suit your needs: */
    static struct tag_attr html_allow[] = {
        { "script", "src" },
        { "img", "src" },
        { "img", "href" },
        { "body", "background" },
        { "frame", "src" },
        { "iframe", "src" },
        { "fig", "src" },
        { "overlay", "src" },
        { "applet", "code" },
        { "script", "src" },
        { "embed", "src" },
        { "bgsound", "src" },
        { "img", "lowsrc" },
        { "input", "src" },
        { "layer", "src" },
        { "table", "background"},
        { "th", "background"},
        { "td", "background"},
        /* Tags below this line are treated specially.  */
        { "a", "href" },
        { "area", "href" },
        { "base", "href" },
        { "link", "href" },
        { "link", "rel" },
        { "meta", "content" },
        { NULL, NULL }
    };

    if (init)
    {
        DEBUGP (("Resetting a parser state.\n"));
        memset (s, 0, sizeof (*s));
    }

    while (1)
    {
        const char*  link_href = NULL;
        const char*  link_rel = NULL;
        int          link_href_saved_size = 0; /* init. just to shut up warning */

        if (!bufsize)
            break;
        /* Let's look for a tag, if we are not already in one.  */
        if (!s->at_value)
        {
            /* Find '<'.  */
            if (*buf != '<')
                for (; bufsize && *buf != '<'; ++buf, --bufsize);
            if (!bufsize)
                break;
            /* Skip spaces.  */
            for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                    ++buf, --bufsize);
            if (!bufsize)
                break;
            p = buf;
            /* Find the tag end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);
            if (!bufsize)
                break;
            if (*buf == '=')
            {
                /* <tag=something> is illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }
            if (p == buf)
            {
                /* *buf == '>'.  */
                ++buf, --bufsize;
                continue;
            }
            s->tag = strdupdelim (p, buf);
            if (*buf == '>')
            {
                free (s->tag);
                s->tag = NULL;
                ++buf, --bufsize;
                continue;
            }
        }
        else                      /* s->at_value */
        {
            /* Reset AT_VALUE.  */
            s->at_value = 0;
            /* If in quotes, just skip out of them and continue living.  */
            if (s->in_quote)
            {
                s->in_quote = 0;
                for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
                if (!bufsize)
                    break;
                ++buf, --bufsize;
            }
            if (!bufsize)
                break;
            if (*buf == '>')
            {
                FREE_MAYBE (s->tag);
                FREE_MAYBE (s->attr);
                s->tag = s->attr = NULL;
                continue;
            }
        }
        /* Find the attributes.  */
        do
        {
            FREE_MAYBE (s->attr);
            s->attr = NULL;
            if (!bufsize)
                break;
            /* Skip the spaces if we have them.  We don't have them at
               places like <img alt="something"src="something-else">.
                                               ^ no spaces here */
            if (ISSPACE (*buf))
                for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            if (*buf == '=')
            {
                /* This is the case of <tag = something>, which is
                illegal.  Just skip it.  */
                ++buf, --bufsize;
                continue;
            }
            p = buf;
            /* Find the attribute end.  */
            for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
                    ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            /* Construct the attribute.  */
            s->attr = strdupdelim (p, buf);
            /* Now we must skip the spaces to find '='.  */
            if (*buf != '=')
            {
                for (; bufsize && ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize);
                if (!bufsize || *buf == '>')
                    break;
            }
            /* If we still don't have '=', something is amiss.  */
            if (*buf != '=')
                continue;
            /* Find the beginning of attribute value by skipping the
               spaces.  */
            ++buf, --bufsize;
            for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
            if (!bufsize || *buf == '>')
                break;
            ph = NULL;
            /* The value of an attribute can, but does not have to be
               quoted.  */
            if (*buf == '\"' || *buf == '\'')
            {
                s->in_quote = 1;
                s->quote_char = *buf;
                p = buf + 1;
                for (++buf, --bufsize;
                        bufsize && *buf != s->quote_char && *buf != '\n';
                        ++buf, --bufsize)
                    if (!ph && *buf == '#' && *(buf - 1) != '&')
                        ph = buf;
                if (!bufsize)
                {
                    s->in_quote = 0;
                    break;
                }
                if (*buf == '\n')
                {
                    /* #### Is the following logic good?

                       Obviously no longer in quote.  It might be well
                       to check whether '>' was encountered, but that
                       would be encouraging writers of invalid HTMLs,
                       and we don't want that, now do we?  */
                    s->in_quote = 0;
                    continue;
                }
            }
            else
            {
                p = buf;
                for (; bufsize && !ISSPACE (*buf) && *buf != '>';
                        ++buf, --bufsize)
                    if (!ph && *buf == '#' && *(buf - 1) != '&')
                        ph = buf;
                if (!bufsize)
                    break;
            }
            /* If '#' was found unprotected in a URI, it is probably an
               HTML marker, or color spec.  */
            *size = (ph ? ph : buf) - p;
            /* The URI is liable to be returned if:
               1) *size != 0;
               2) its tag and attribute are found in html_allow.  */
            if (*size && idmatch (html_allow, s->tag, s->attr))
            {
                if (strcasecmp(s->tag, "a") == EQ ||
                        strcasecmp(s->tag, "area") == EQ)
                {
                    /* Only follow these if we're not at a -p leaf node, as they
                       always link to external documents. */
                    if (!dash_p_leaf_HTML)
                    {
                        s->at_value = 1;
                        return p;
                    }
                }
                else if (!strcasecmp (s->tag, "base") &&
                         !strcasecmp (s->attr, "href"))
                {
                    FREE_MAYBE (s->base);
                    s->base = strdupdelim (p, buf);
                }
                else if (strcasecmp(s->tag, "link") == EQ)
                {
                    if (strcasecmp(s->attr, "href") == EQ)
                    {
                        link_href = p;
                        link_href_saved_size = *size;  /* for restoration below */
                    }
                    else if (strcasecmp(s->attr, "rel") == EQ)
                        link_rel = p;

                    if (link_href != NULL && link_rel != NULL)
                        /* Okay, we've now seen this <LINK> tag's HREF and REL
                           attributes (they may be in either order), so it's now
                           possible to decide if we want to traverse it. */
                        if (!dash_p_leaf_HTML ||
                                strncasecmp(link_rel, "stylesheet",
                                            sizeof("stylesheet") - 1) == EQ)
                            /* In the normal case, all <LINK> tags are fair game.

                            In the special case of when -p is active, however, and
                             we're at a leaf node (relative to the -l max. depth) in
                             the HTML document tree, the only <LINK> tag we'll
                             follow is a <LINK REL="stylesheet">, as it's necessary
                             for displaying this document properly.  We won't follow
                             other <LINK> tags, like <LINK REL="home">, for
                             instance, as they refer to external documents.

                             Note that the above strncasecmp() will incorrectly
                             consider something like '<LINK REL="stylesheet.old"' as
                             equivalent to '<LINK REL="stylesheet"'.  Not really
                             worth the trouble to explicitly check for such cases --
                             if time is spent, it should be spent ripping out wget's
                             somewhat kludgy HTML parser and hooking in a real,
                             componentized one. */
                        {
                            /* When we return, the 'size' IN/OUT parameter
                               determines where in the buffer the end of the current
                               attribute value is.  If REL came after HREF in this
                               <LINK> tag, size is currently set to the size for
                               REL's value -- set it to what it was when we were
                               looking at HREF's value. */
                            *size = link_href_saved_size;

                            s->at_value = 1;
                            return link_href;
                        }
                }
                else if (!strcasecmp (s->tag, "meta") &&
                         !strcasecmp (s->attr, "content"))
                {
                    /* Some pages use a META tag to specify that the page
                       be refreshed by a new page after a given number of
                       seconds.  We need to attempt to extract an URL for
                       the new page from the other garbage present.  The
                       general format for this is:
                       <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">

                       So we just need to skip past the "0; URL="
                       garbage to get to the URL.  META tags are also
                       used for specifying random things like the page
                       author's name and what editor was used to create
                       it.  So we need to be careful to ignore them and
                       not assume that an URL will be present at all.  */
                    for (; *size && ISDIGIT (*p); p++, *size -= 1);
                    if (*p == ';')
                    {
                        for (p++, *size -= 1;
                                *size && ISSPACE (*p);
                                p++, *size -= 1) ;
                        if (!strncasecmp (p, "URL=", 4))
                        {
                            p += 4, *size -= 4;
                            s->at_value = 1;
                            return p;
                        }
                    }
                }
                else
                {
                    s->at_value = 1;
                    return p;
                }
            }
            /* Exit from quote.  */
            if (*buf == s->quote_char)
            {
                s->in_quote = 0;
                ++buf, --bufsize;
            }
        } while (*buf != '>');
        FREE_MAYBE (s->tag);
        FREE_MAYBE (s->attr);
        s->tag = s->attr = NULL;
        if (!bufsize)
            break;
    }

    FREE_MAYBE (s->tag);
    FREE_MAYBE (s->attr);
    FREE_MAYBE (s->base);
    memset (s, 0, sizeof (*s));	/* just to be sure */
    DEBUGP (("HTML parser ends here (state destroyed).\n"));
    return NULL;
}