/*
 * Handle a character reference of the form "&name;" located at parser->bufp.
 *
 * The reference is only considered when a ';' is found at most 7 characters
 * after the '&'; the 9-byte local buffer then holds the text from '&' through
 * ';' plus the terminating NUL.  Otherwise a literal '&' is emitted and only
 * that one character is consumed.
 *
 * On return parser->state is SC_HTML_NORMAL when a literal '&' or a known
 * replacement was emitted.  It is left at SC_HTML_UNKNOWN when the reference
 * is not present in parser->symbol_table (the raw "&name;" text is emitted in
 * that case) or when the leading-'&' precondition check fails.
 */
static void sc_html_parse_special(SC_HTMLParser *parser)
{
	gchar symbol_name[9];
	const gchar *replacement;
	gint len = 0;

	parser->state = SC_HTML_UNKNOWN;
	cm_return_if_fail(*parser->bufp == '&');

	/* Measure the distance from '&' to the closing ';' (or to end of input). */
	while (parser->bufp[len] != '\0' && parser->bufp[len] != ';')
		len++;

	if (len > 7 || parser->bufp[len] != ';') {
		/* Too long or unterminated: output a literal '&'. */
		sc_html_append_char(parser, *parser->bufp++);
		parser->state = SC_HTML_NORMAL;
		return;
	}

	/*
	 * NOTE(review): strncpy2 is assumed to copy at most size - 1 bytes and
	 * NUL-terminate, so len + 2 covers "&name;" -- confirm against its
	 * definition.
	 */
	strncpy2(symbol_name, parser->bufp, len + 2);
	parser->bufp += len + 1;

	replacement = g_hash_table_lookup(parser->symbol_table, symbol_name);
	if (replacement == NULL) {
		/* Unknown reference: pass the raw text through unchanged. */
		sc_html_append_str(parser, symbol_name, -1);
		return;
	}

	sc_html_append_str(parser, replacement, -1);
	parser->state = SC_HTML_NORMAL;
}
/*
 * Parse the next chunk of HTML and return the text accumulated for it.
 *
 * The output string (parser->str) is reset on entry.  When the input buffer
 * is exhausted a new line is read first; NULL is returned if that read
 * reports SC_HTML_EOF.  Parsing normally continues until the buffered input
 * is consumed, but returns early as soon as a tag yields SC_HTML_HREF_BEG or
 * SC_HTML_HREF.
 *
 * Returns parser->str->str (storage owned by the parser), or NULL at EOF.
 */
gchar *sc_html_parse(SC_HTMLParser *parser)
{
	parser->state = SC_HTML_NORMAL;
	g_string_truncate(parser->str, 0);

	/* Refill the input buffer once everything buffered has been consumed. */
	if (*parser->bufp == '\0') {
		g_string_truncate(parser->buf, 0);
		parser->bufp = parser->buf->str;
		if (sc_html_read_line(parser) == SC_HTML_EOF)
			return NULL;
	}

	while (*parser->bufp != '\0') {
		const gchar ch = *parser->bufp;

		if (ch == '<') {
			const SC_HTMLState tag_state = sc_html_parse_tag(parser);

			/*
			 * When we see an href, flush the text collected so far;
			 * the caller then collects everything up to the closing
			 * anchor tag.
			 */
			if (tag_state == SC_HTML_HREF_BEG || tag_state == SC_HTML_HREF)
				return parser->str->str;
			continue;
		}

		if (ch == '&') {
			sc_html_parse_special(parser);
			continue;
		}

		if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
			/* Treat "\r\n" as one line break: step onto the '\n'. */
			if (ch == '\r' && parser->bufp[1] == '\n')
				parser->bufp++;

			if (!parser->pre) {
				/*
				 * Outside <pre>, whitespace is only remembered as a
				 * pending space, and not at all right after a newline.
				 */
				if (!parser->newline)
					parser->space = TRUE;
				parser->bufp++;
				continue;
			}
			/* Inside <pre> the whitespace is emitted verbatim below. */
		}

		sc_html_append_char(parser, *parser->bufp++);
	}

	return parser->str->str;
}
/*
 * Handle a character reference starting with '&' at parser->bufp.
 *
 * entity_decode() is asked to decode the reference.  On success its result is
 * appended to the output, released with g_free(), and the input position is
 * advanced past the terminating ';'.  On failure a literal '&' is emitted and
 * only that character is consumed.
 *
 * parser->state is SC_HTML_NORMAL on return, unless the leading-'&'
 * precondition check fails, in which case it is left at SC_HTML_UNKNOWN.
 */
static void sc_html_parse_special(SC_HTMLParser *parser)
{
	gchar *entity;

	parser->state = SC_HTML_UNKNOWN;
	cm_return_if_fail(*parser->bufp == '&');

	entity = entity_decode(parser->bufp);
	if (entity != NULL) {
		sc_html_append_str(parser, entity, -1);
		g_free(entity);

		/*
		 * Skip the reference up to and including its ';'.  The scan is
		 * bounded by the terminating NUL so it cannot run off the end of
		 * the buffer should entity_decode() ever accept input that has
		 * no ';' (NOTE(review): it presumably never does -- confirm).
		 */
		while (*parser->bufp != '\0') {
			if (*parser->bufp++ == ';')
				break;
		}
	} else {
		/* Not a decodable reference: output a literal '&'. */
		sc_html_append_char(parser, *parser->bufp++);
	}

	parser->state = SC_HTML_NORMAL;
}
/* Start a new output line unless the output is already at a line start. */
static void sc_html_tag_ensure_line_start(SC_HTMLParser *parser)
{
	if (!parser->newline) {
		parser->space = FALSE;
		sc_html_append_char(parser, '\n');
	}
}

/* Emit line breaks so that the output ends with an empty line. */
static void sc_html_tag_ensure_empty_line(SC_HTMLParser *parser)
{
	if (parser->empty_line)
		return;

	parser->space = FALSE;
	if (!parser->newline)
		sc_html_append_char(parser, '\n');
	sc_html_append_char(parser, '\n');
}

/*
 * Consume the tag at the current input position and apply its effect on the
 * text output (line breaks, rules, list markers, quote indentation, <pre>
 * mode, link target).
 *
 * Returns the resulting parser->state.  SC_HTML_UNKNOWN is returned when the
 * tag could not be parsed, for tags this function does not handle, and for
 * opening heading tags (<h1>, <h2>, ...), which emit line breaks but do not
 * assign a state.
 */
static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
{
	gchar buf[SC_HTMLBUFSIZE];
	SC_HTMLTag *tag;
	const gchar *name;

	sc_html_get_parenthesis(parser, buf, sizeof(buf));
	tag = sc_html_get_tag(buf);

	parser->state = SC_HTML_UNKNOWN;
	if (!tag)
		return SC_HTML_UNKNOWN;

	name = tag->name;

	if (!strcmp(name, "br") || !strcmp(name, "br/")) {
		parser->space = FALSE;
		sc_html_append_char(parser, '\n');
		parser->state = SC_HTML_BR;
	} else if (!strcmp(name, "a")) {
		GList *cur;

		/* Drop the previous link target; g_free(NULL) is a no-op. */
		g_free(parser->href);
		parser->href = NULL;

		/* The first href attribute wins. */
		for (cur = tag->attr; cur != NULL; cur = cur->next) {
			SC_HTMLAttr *attr = (SC_HTMLAttr *)cur->data;

			if (attr && !strcmp(attr->name, "href")) {
				parser->href = g_strdup(attr->value);
				decode_href(parser);
				break;
			}
		}

		/* An anchor without a usable href still gets an empty target. */
		if (parser->href == NULL)
			parser->href = g_strdup("");
		parser->state = SC_HTML_HREF_BEG;
	} else if (!strcmp(name, "/a")) {
		parser->state = SC_HTML_HREF;
	} else if (!strcmp(name, "p")) {
		parser->space = FALSE;
		sc_html_tag_ensure_empty_line(parser);
		parser->state = SC_HTML_PAR;
	} else if (!strcmp(name, "pre")) {
		parser->pre = TRUE;
		parser->state = SC_HTML_PRE;
	} else if (!strcmp(name, "/pre")) {
		parser->pre = FALSE;
		parser->state = SC_HTML_NORMAL;
	} else if (!strcmp(name, "hr")) {
		sc_html_tag_ensure_line_start(parser);
		sc_html_append_str(parser, HR_STR, -1);
		sc_html_append_char(parser, '\n');
		parser->state = SC_HTML_HR;
	} else if (!strcmp(name, "div") || !strcmp(name, "ul") ||
		   !strcmp(name, "li") || !strcmp(name, "table") ||
		   !strcmp(name, "dd") || !strcmp(name, "tr")) {
		sc_html_tag_ensure_line_start(parser);
		if (!strcmp(name, "li"))
			sc_html_append_str(parser, LI_STR, -1);
		parser->state = SC_HTML_NORMAL;
	} else if (name[0] == 'h' && g_ascii_isdigit(name[1])) {
		/* Opening heading: parser->state deliberately left untouched. */
		sc_html_tag_ensure_line_start(parser);
		sc_html_append_char(parser, '\n');
	} else if (!strcmp(name, "blockquote")) {
		parser->state = SC_HTML_NORMAL;
		parser->indent++;
	} else if (!strcmp(name, "/blockquote")) {
		parser->state = SC_HTML_NORMAL;
		/*
		 * Ignore a closing tag with no matching opener so that
		 * malformed input cannot push the quote depth below zero.
		 */
		if (parser->indent > 0)
			parser->indent--;
	} else if (!strcmp(name, "/table") ||
		   (name[0] == '/' && name[1] == 'h' &&
		    g_ascii_isdigit(name[2]))) {
		sc_html_tag_ensure_empty_line(parser);
		parser->state = SC_HTML_NORMAL;
	} else if (!strcmp(name, "/div") || !strcmp(name, "/ul") ||
		   !strcmp(name, "/li")) {
		sc_html_tag_ensure_line_start(parser);
		parser->state = SC_HTML_NORMAL;
	}

	sc_html_free_tag(tag);

	return parser->state;
}