/*
 * Look up the PolyEdge glass keyed by (v0, v1, 0, 0) in PLEX and add the
 * value idmatch() reports for the requested ids against the stored ids to
 * that glass's coefficient.
 *
 * Returns 1 when the coefficient was updated; returns 0 when no glass was
 * found or when idmatch() reports 0 for the pair.
 * NOTE(review): the idmatch() result is presumably an orientation sign
 * (+1 / -1) -- confirm against idmatch().
 */
int add_edge (struct Plex *plex, long v0, long v1)
{
	long wanted[4] = { v0, v1, 0, 0 };
	long stored[4];
	struct Glass *edge;
	int orientation;
	int k;

	edge = lookupGlass (plex, PolyEdge, wanted);
	if (!edge)
		return 0;

	/* Work on a private copy of the ids held by the glass. */
	k = 0;
	while (k < 4) {
		stored[k] = edge->idnumbers[k];
		k++;
	}

	orientation = idmatch (PolyEdge, wanted, stored);
	if (orientation != 0) {
		edge->coefficient += orientation;
		return 1;
	}
	return 0;
}
/*
 * Find the glass of the given TYPE whose id numbers match IDNUMBERS.
 *
 * The bucket is selected as plex->plexis[type * maxhash + hash], where hash
 * comes from hashGlass(idnumbers, maxhash); the bucket's chain is then
 * walked through the ->next links.  The first glass for which idmatch()
 * returns non-zero is returned; NULL is returned when the chain is
 * exhausted without a match.
 */
struct Glass *lookupGlass (struct Plex *plex, enum PlexType type, long idnumbers[4])
{
	long nbuckets = plex->maxhash;
	long bucket = hashGlass (idnumbers, nbuckets);
	struct Plexi *slot = plex->plexis + ((long) type) * nbuckets + bucket;
	struct Glass *cand = slot->head;

	while (cand != NULL) {
		long cand_ids[4];
		int outcome;
		int k;

		/* Compare against a local copy of the candidate's ids. */
		for (k = 0; k < 4; k++)
			cand_ids[k] = cand->idnumbers[k];

		outcome = idmatch (type, cand_ids, idnumbers);
		if (outcome != 0)
			return cand;

		cand = cand->next;
	}
	return NULL;
}
/*History: 2015-01-29 17:28 初步看了下,没有例子,不知道实际解析时的情况,比较糊涂。最好是自己写例子测试一下的。另外看起来得了解一点html语法。 */ const char * htmlfindurl(const char *buf, int bufsize, int *size, int init) { const char *p, *ph; state_t *s; /* NULL-terminated list of tags and modifiers someone would want to follow -- feel free to edit to suit your needs: */ //z 允许的 html tags;声明为static ,只要一份即可。 static struct tag_attr html_allow[] = { //z tag : attr 值对 { "a", "href" }, { "img", "src" }, { "img", "href" }, { "body", "background" }, { "frame", "src" }, { "iframe", "src" }, { "fig", "src" }, { "overlay", "src" }, { "applet", "code" }, { "script", "src" }, { "embed", "src" }, { "bgsound", "src" }, { "area", "href" }, { "img", "lowsrc" }, { "input", "src" }, { "layer", "src" }, { "table", "background"}, { "th", "background"}, { "td", "background"}, /* Tags below this line are treated specially. */ { "base", "href" }, { "meta", "content" }, { NULL, NULL }//z 最后以NULL作为结尾 }; s = &global_state; if (init) { DEBUGP (("Resetting a parser state.\n")); memset (s, 0, sizeof (*s)); } while (1) { //z 如果 bufsize 为0,跳出循环 if (!bufsize) break; /* Let's look for a tag, if we are not already in one. */ //z 首先寻找 tag if (!s->at_value) { /* Find '<'. */ //z 找到 < if (*buf != '<') for (; bufsize && *buf != '<'; ++buf, --bufsize); //z 如果 bufsize 为0 ,那么到达了结尾 if (!bufsize) break; /* Skip spaces. */ //z 在处理的时候,跳过空格 for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize) break; p = buf; /* Find the tag end. */ //z 直到找到空格或者找到 >,或者找到 =,或者到达结尾。 for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize) break; //z 如果找到了 = if (*buf == '=') { /* <tag=something> is illegal. Just skip it. */ ++buf, --bufsize; continue; } if (p == buf) { /* *buf == '>'. */ ++buf, --bufsize; continue; } s->tag = strdupdelim (p, buf); if (*buf == '>') { free (s->tag); s->tag = NULL; ++buf, --bufsize; continue; } } else /* s->at_value */ { //z 这意思是在查找 value 。 /* Reset AT_VALUE. 
*/ s->at_value = 0; /* If in quotes, just skip out of them and continue living. */ if (s->in_quote) { s->in_quote = 0; for (; bufsize && *buf != s->quote_char; ++buf, --bufsize); if (!bufsize) break; ++buf, --bufsize; } if (!bufsize) break; if (*buf == '>') { FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; continue; } } /* Find the attributes. */ do { FREE_MAYBE (s->attr); s->attr = NULL; if (!bufsize) break; /* Skip the spaces if we have them. We don't have them at places like <img alt="something"src="something-else">. ^ no spaces here */ if (ISSPACE (*buf)) for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; if (*buf == '=') { /* This is the case of <tag = something>, which is illegal. Just skip it. */ ++buf, --bufsize; continue; } p = buf; /* Find the attribute end. */ for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize || *buf == '>') break; //z 找到这其间的值为 attr 。 /* Construct the attribute. */ s->attr = strdupdelim (p, buf); /* Now we must skip the spaces to find '='. */ if (*buf != '=') { for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; } /* If we still don't have '=', something is amiss. */ //z 是否找到了 = ,如果没有找到 = ,可能出现了错误。 if (*buf != '=') continue; /* Find the beginning of attribute value by skipping the spaces. */ ++buf, --bufsize; //z 越过若干个空白字符。 for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); //z 是否结束或者找到了‘>’ if (!bufsize || *buf == '>') break; ph = NULL; /* The value of an attribute can, but does not have to be quoted. 
*/ //z 如果当前字符为 ' 或者 " , 进入引号状态 if (*buf == '\"' || *buf == '\'') { //z 进入 quote 状态 s->in_quote = 1; //z 记住当前的字符串,方便寻找到下一个做比对。 s->quote_char = *buf; //z p 指向引号内第一个字符 p = buf + 1; //z 步进,直到找到另一个引号,或者遇到了回车 for (++buf, --bufsize; bufsize && *buf != s->quote_char && *buf != '\n'; ++buf, --bufsize) //z 如果当前字符串为 # , 记录下其位置 if (*buf == '#') ph = buf; if (!bufsize) { //z 如果到达了字符结尾,结束 in_quote 状态 s->in_quote = 0; break; } //z 如果遇到了 '\n' ,继续下一轮。 if (*buf == '\n') { /* #### Is the following logic good? Obviously no longer in quote. It might be well to check whether '>' was encountered, but that would be encouraging writers of invalid HTMLs, and we don't want that, now do we? */ s->in_quote = 0; continue; } } else { p = buf; for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize) if (*buf == '#') ph = buf; if (!bufsize) break; } //z URI 中的# ( found unprotected 是什么意思? ) # 可能表示的意思是一个 html marker 或者 color spec 。 /* If '#' was found unprotected in a URI, it is probably an HTML marker, or color spec. */ //z 如果有 # ,那么将 ph 视作结束? *size = (ph ? ph : buf) - p; /* The URI is liable to be returned if: 1) *size != 0; 2) its tag and attribute are found in html_allow. */ //z 实际可能表示的例子有 : <a href="http://www.w3school.com.cn/">Visit W3School</a> 这个样子 if (*size && idmatch (html_allow, s->tag, s->attr)) { if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href")) { FREE_MAYBE (s->base); s->base = strdupdelim (p, buf); } //z 比对 meta 和 content else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content")) { /* Some pages use a META tag to specify that the page be refreshed by a new page after a given number of seconds. We need to attempt to extract an URL for the new page from the other garbage present. The general format for this is: <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html"> So we just need to skip past the "0; URL=" garbage to get to the URL. 
META tags are also used for specifying random things like the page author's name and what editor was used to create it. So we need to be careful to ignore them and not assume that an URL will be present at all. */ //z 只要是数字,那么持续向前 for (; *size && ISDIGIT (*p); p++, *size -= 1); //z 查看是否会遇到 ; if (*p == ';') { //z 跳过 space。 for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ; //z 比对,是否找到了 URL, if (!strncasecmp (p, "URL=", 4)) { //z 如果在 meta 中找到了 URL p += 4, *size -= 4; s->at_value = 1; //z 这意思是直接返回 p? return p; } } } else { s->at_value = 1; return p; } } /* Exit from quote. */ if (*buf == s->quote_char) { s->in_quote = 0; ++buf, --bufsize; } } while (*buf != '>'); FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; if (!bufsize) break; } FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); FREE_MAYBE (s->base); memset (s, 0, sizeof (*s)); /* just to be sure */ DEBUGP (("HTML parser ends here (state destroyed).\n")); return NULL; }
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags describing URLs to follow. When a tag is encountered, extract its components (as described by html_allow[] array), and return the address and the length of the string. Return NULL if no URL is found. */ const char * htmlfindurl (const char *buf, int bufsize, int *size, int init) { const char *p, *ph; state_t *s; /* NULL-terminated list of tags and modifiers someone would want to follow -- feel free to edit to suit your needs: */ static struct tag_attr html_allow[] = { { "a", "href" }, { "img", "src" }, { "img", "href" }, { "body", "background" }, { "frame", "src" }, { "iframe", "src" }, { "fig", "src" }, { "overlay", "src" }, { "applet", "code" }, { "script", "src" }, { "embed", "src" }, { "bgsound", "src" }, { "area", "href" }, { "img", "lowsrc" }, { "input", "src" }, { "layer", "src" }, { "table", "background"}, { "th", "background"}, { "td", "background"}, /* Tags below this line are treated specially. */ { "base", "href" }, { "meta", "content" }, { NULL, NULL } }; s = &global_state; if (init) { DEBUGP (("Resetting a parser state.\n")); memset (s, 0, sizeof (*s)); } while (1) { if (!bufsize) break; /* Let's look for a tag, if we are not already in one. */ if (!s->at_value) { /* Find '<'. */ if (*buf != '<') for (; bufsize && *buf != '<'; ++buf, --bufsize); if (!bufsize) break; /* Skip spaces. */ for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize) break; p = buf; /* Find the tag end. */ for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize) break; if (*buf == '=') { /* <tag=something> is illegal. Just skip it. */ ++buf, --bufsize; continue; } if (p == buf) { /* *buf == '>'. */ ++buf, --bufsize; continue; } s->tag = strdupdelim (p, buf); if (*buf == '>') { free (s->tag); s->tag = NULL; ++buf, --bufsize; continue; } } else /* s->at_value */ { /* Reset AT_VALUE. 
*/ s->at_value = 0; /* If in quotes, just skip out of them and continue living. */ if (s->in_quote) { s->in_quote = 0; for (; bufsize && *buf != s->quote_char; ++buf, --bufsize); if (!bufsize) break; ++buf, --bufsize; } if (!bufsize) break; if (*buf == '>') { FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; continue; } } /* Find the attributes. */ do { FREE_MAYBE (s->attr); s->attr = NULL; if (!bufsize) break; /* Skip the spaces if we have them. We don't have them at places like <img alt="something"src="something-else">. ^ no spaces here */ if (ISSPACE (*buf)) for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; if (*buf == '=') { /* This is the case of <tag = something>, which is illegal. Just skip it. */ ++buf, --bufsize; continue; } p = buf; /* Find the attribute end. */ for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize || *buf == '>') break; /* Construct the attribute. */ s->attr = strdupdelim (p, buf); /* Now we must skip the spaces to find '='. */ if (*buf != '=') { for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; } /* If we still don't have '=', something is amiss. */ if (*buf != '=') continue; /* Find the beginning of attribute value by skipping the spaces. */ ++buf, --bufsize; for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; ph = NULL; /* The value of an attribute can, but does not have to be quoted. */ if (*buf == '\"' || *buf == '\'') { s->in_quote = 1; s->quote_char = *buf; p = buf + 1; for (++buf, --bufsize; bufsize && *buf != s->quote_char && *buf != '\n'; ++buf, --bufsize) if (*buf == '#') ph = buf; if (!bufsize) { s->in_quote = 0; break; } if (*buf == '\n') { /* #### Is the following logic good? Obviously no longer in quote. 
It might be well to check whether '>' was encountered, but that would be encouraging writers of invalid HTMLs, and we don't want that, now do we? */ s->in_quote = 0; continue; } } else { p = buf; for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize) if (*buf == '#') ph = buf; if (!bufsize) break; } /* If '#' was found unprotected in a URI, it is probably an HTML marker, or color spec. */ *size = (ph ? ph : buf) - p; /* The URI is liable to be returned if: 1) *size != 0; 2) its tag and attribute are found in html_allow. */ if (*size && idmatch (html_allow, s->tag, s->attr)) { if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href")) { FREE_MAYBE (s->base); s->base = strdupdelim (p, buf); } else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content")) { /* Some pages use a META tag to specify that the page be refreshed by a new page after a given number of seconds. We need to attempt to extract an URL for the new page from the other garbage present. The general format for this is: <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html"> So we just need to skip past the "0; URL=" garbage to get to the URL. META tags are also used for specifying random things like the page author's name and what editor was used to create it. So we need to be careful to ignore them and not assume that an URL will be present at all. */ for (; *size && ISDIGIT (*p); p++, *size -= 1); if (*p == ';') { for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ; if (!strncasecmp (p, "URL=", 4)) { p += 4, *size -= 4; s->at_value = 1; return p; } } } else { s->at_value = 1; return p; } } /* Exit from quote. 
*/ if (*buf == s->quote_char) { s->in_quote = 0; ++buf, --bufsize; } } while (*buf != '>'); FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; if (!bufsize) break; } FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); FREE_MAYBE (s->base); memset (s, 0, sizeof (*s)); /* just to be sure */ DEBUGP (("HTML parser ends here (state destroyed).\n")); return NULL; }
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags describing URLs to follow. When a tag is encountered, extract its components (as described by html_allow[] array), and return the address and the length of the string. Return NULL if no URL is found. */ const char * htmlfindurl (const char *buf, int bufsize, int *size, int init, int dash_p_leaf_HTML) { const char *p, *ph; state_t *s = &global_state; /* NULL-terminated list of tags and modifiers someone would want to follow -- feel free to edit to suit your needs: */ static struct tag_attr html_allow[] = { { "script", "src" }, { "img", "src" }, { "img", "href" }, { "body", "background" }, { "frame", "src" }, { "iframe", "src" }, { "fig", "src" }, { "overlay", "src" }, { "applet", "code" }, { "script", "src" }, { "embed", "src" }, { "bgsound", "src" }, { "img", "lowsrc" }, { "input", "src" }, { "layer", "src" }, { "table", "background"}, { "th", "background"}, { "td", "background"}, /* Tags below this line are treated specially. */ { "a", "href" }, { "area", "href" }, { "base", "href" }, { "link", "href" }, { "link", "rel" }, { "meta", "content" }, { NULL, NULL } }; if (init) { DEBUGP (("Resetting a parser state.\n")); memset (s, 0, sizeof (*s)); } while (1) { const char* link_href = NULL; const char* link_rel = NULL; int link_href_saved_size = 0; /* init. just to shut up warning */ if (!bufsize) break; /* Let's look for a tag, if we are not already in one. */ if (!s->at_value) { /* Find '<'. */ if (*buf != '<') for (; bufsize && *buf != '<'; ++buf, --bufsize); if (!bufsize) break; /* Skip spaces. */ for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize) break; p = buf; /* Find the tag end. */ for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize) break; if (*buf == '=') { /* <tag=something> is illegal. Just skip it. */ ++buf, --bufsize; continue; } if (p == buf) { /* *buf == '>'. 
*/ ++buf, --bufsize; continue; } s->tag = strdupdelim (p, buf); if (*buf == '>') { free (s->tag); s->tag = NULL; ++buf, --bufsize; continue; } } else /* s->at_value */ { /* Reset AT_VALUE. */ s->at_value = 0; /* If in quotes, just skip out of them and continue living. */ if (s->in_quote) { s->in_quote = 0; for (; bufsize && *buf != s->quote_char; ++buf, --bufsize); if (!bufsize) break; ++buf, --bufsize; } if (!bufsize) break; if (*buf == '>') { FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; continue; } } /* Find the attributes. */ do { FREE_MAYBE (s->attr); s->attr = NULL; if (!bufsize) break; /* Skip the spaces if we have them. We don't have them at places like <img alt="something"src="something-else">. ^ no spaces here */ if (ISSPACE (*buf)) for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; if (*buf == '=') { /* This is the case of <tag = something>, which is illegal. Just skip it. */ ++buf, --bufsize; continue; } p = buf; /* Find the attribute end. */ for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; ++buf, --bufsize); if (!bufsize || *buf == '>') break; /* Construct the attribute. */ s->attr = strdupdelim (p, buf); /* Now we must skip the spaces to find '='. */ if (*buf != '=') { for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; } /* If we still don't have '=', something is amiss. */ if (*buf != '=') continue; /* Find the beginning of attribute value by skipping the spaces. */ ++buf, --bufsize; for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); if (!bufsize || *buf == '>') break; ph = NULL; /* The value of an attribute can, but does not have to be quoted. 
*/ if (*buf == '\"' || *buf == '\'') { s->in_quote = 1; s->quote_char = *buf; p = buf + 1; for (++buf, --bufsize; bufsize && *buf != s->quote_char && *buf != '\n'; ++buf, --bufsize) if (!ph && *buf == '#' && *(buf - 1) != '&') ph = buf; if (!bufsize) { s->in_quote = 0; break; } if (*buf == '\n') { /* #### Is the following logic good? Obviously no longer in quote. It might be well to check whether '>' was encountered, but that would be encouraging writers of invalid HTMLs, and we don't want that, now do we? */ s->in_quote = 0; continue; } } else { p = buf; for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize) if (!ph && *buf == '#' && *(buf - 1) != '&') ph = buf; if (!bufsize) break; } /* If '#' was found unprotected in a URI, it is probably an HTML marker, or color spec. */ *size = (ph ? ph : buf) - p; /* The URI is liable to be returned if: 1) *size != 0; 2) its tag and attribute are found in html_allow. */ if (*size && idmatch (html_allow, s->tag, s->attr)) { if (strcasecmp(s->tag, "a") == EQ || strcasecmp(s->tag, "area") == EQ) { /* Only follow these if we're not at a -p leaf node, as they always link to external documents. */ if (!dash_p_leaf_HTML) { s->at_value = 1; return p; } } else if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href")) { FREE_MAYBE (s->base); s->base = strdupdelim (p, buf); } else if (strcasecmp(s->tag, "link") == EQ) { if (strcasecmp(s->attr, "href") == EQ) { link_href = p; link_href_saved_size = *size; /* for restoration below */ } else if (strcasecmp(s->attr, "rel") == EQ) link_rel = p; if (link_href != NULL && link_rel != NULL) /* Okay, we've now seen this <LINK> tag's HREF and REL attributes (they may be in either order), so it's now possible to decide if we want to traverse it. */ if (!dash_p_leaf_HTML || strncasecmp(link_rel, "stylesheet", sizeof("stylesheet") - 1) == EQ) /* In the normal case, all <LINK> tags are fair game. 
In the special case of when -p is active, however, and we're at a leaf node (relative to the -l max. depth) in the HTML document tree, the only <LINK> tag we'll follow is a <LINK REL="stylesheet">, as it's necessary for displaying this document properly. We won't follow other <LINK> tags, like <LINK REL="home">, for instance, as they refer to external documents. Note that the above strncasecmp() will incorrectly consider something like '<LINK REL="stylesheet.old"' as equivalent to '<LINK REL="stylesheet"'. Not really worth the trouble to explicitly check for such cases -- if time is spent, it should be spent ripping out wget's somewhat kludgy HTML parser and hooking in a real, componentized one. */ { /* When we return, the 'size' IN/OUT parameter determines where in the buffer the end of the current attribute value is. If REL came after HREF in this <LINK> tag, size is currently set to the size for REL's value -- set it to what it was when we were looking at HREF's value. */ *size = link_href_saved_size; s->at_value = 1; return link_href; } } else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content")) { /* Some pages use a META tag to specify that the page be refreshed by a new page after a given number of seconds. We need to attempt to extract an URL for the new page from the other garbage present. The general format for this is: <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html"> So we just need to skip past the "0; URL=" garbage to get to the URL. META tags are also used for specifying random things like the page author's name and what editor was used to create it. So we need to be careful to ignore them and not assume that an URL will be present at all. */ for (; *size && ISDIGIT (*p); p++, *size -= 1); if (*p == ';') { for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ; if (!strncasecmp (p, "URL=", 4)) { p += 4, *size -= 4; s->at_value = 1; return p; } } } else { s->at_value = 1; return p; } } /* Exit from quote. 
*/ if (*buf == s->quote_char) { s->in_quote = 0; ++buf, --bufsize; } } while (*buf != '>'); FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); s->tag = s->attr = NULL; if (!bufsize) break; } FREE_MAYBE (s->tag); FREE_MAYBE (s->attr); FREE_MAYBE (s->base); memset (s, 0, sizeof (*s)); /* just to be sure */ DEBUGP (("HTML parser ends here (state destroyed).\n")); return NULL; }