void parse_urls(const char *filename, const url_list_t *elem) { TidyDoc tdoc; int err; FILE *outfile = NULL; tdoc = tidyCreate(); tidyOptSetBool(tdoc, TidyForceOutput, yes); tidyOptSetBool(tdoc, TidyMark, no); tidyOptSetBool(tdoc, TidyHideEndTags, yes); tidyOptSetBool(tdoc, TidyDropEmptyParas, no); tidyOptSetBool(tdoc, TidyJoinStyles, no); tidyOptSetBool(tdoc, TidyPreserveEntities, yes); tidyOptSetInt(tdoc, TidyMergeDivs, no); tidyOptSetInt(tdoc, TidyMergeSpans, no); tidyOptSetInt(tdoc, TidyWrapLen, 4096); tidyOptSetValue(tdoc, TidyCharEncoding, "utf8"); tidySetReportFilter(tdoc, filter_cb); err = tidyParseFile(tdoc, filename); if (err >= 0) err = tidyCleanAndRepair(tdoc); if (err >= 0) { outfile = option_values.save_relative_links && !option_values.disable_save_tree ? fopen(filename, "w") : NULL; parse_html(tdoc, tidyGetRoot(tdoc), elem, 1, outfile); if (outfile) fclose(outfile); } tidyRelease(tdoc); }
/*
 * Append a line of text to the buffer.
 *
 * text_view: view whose buffer receives the text.
 * txt:       text to append; it is passed through convert_to_utf8() and the
 *            converted copy is released with g_free() before returning.
 * ignore:    flag word forwarded to parse_html(); when HTML_IGNORE_END is
 *            set, the "real_end_mark" mark is kept in the buffer after the
 *            call instead of being deleted.
 *
 * Line-ending handling on the converted copy:
 *   - text containing "<br>" (case-insensitive): every '\n' and '\r'
 *     becomes a space;
 *   - otherwise, text containing '\r': '\r' becomes a space when the text
 *     also contains '\n', else '\r' becomes '\n'.
 *
 * If a "real_end_mark" mark already exists, everything from that mark to the
 * end of the buffer is deleted before the new text is inserted.
 *
 * The view is scrolled to the end only if the end of the buffer was not
 * below the visible rectangle before the insertion.
 */
void html_text_buffer_append(GtkTextView *text_view, char *txt, int ignore)
{
	gchar *text = convert_to_utf8(txt);
	GtkTextIter iter;
	GtkTextMark *insert_mark;
	GdkRectangle iter_loc;
	GdkRectangle visible_rect;
	GtkTextBuffer *buffer = gtk_text_view_get_buffer(text_view);
	char *c;

	/*
	 * NOTE(review): convert_to_utf8() is not visible here; guard against a
	 * NULL result so it is never handed to strcasestr()/strchr().
	 */
	if (!text)
		return;

	if (strcasestr(text, "<br>")) {
		/* Single linear pass instead of rescanning from the start
		 * of the string after every replacement. */
		for (c = text; *c; c++) {
			if (*c == '\n' || *c == '\r')
				*c = ' ';
		}
	} else if (strchr(text, '\r')) {
		/* Decide the replacement once: it depends only on whether
		 * the text also carries '\n'. */
		char repl = strchr(text, '\n') ? ' ' : '\n';

		for (c = strchr(text, '\r'); c; c = strchr(c + 1, '\r'))
			*c = repl;
	}

	gtk_text_buffer_get_end_iter(buffer, &iter);

	insert_mark = gtk_text_buffer_get_mark(buffer, "real_end_mark");
	if (insert_mark) {
		/* A mark left by an earlier call: drop everything after it. */
		GtkTextIter del;

		gtk_text_buffer_get_iter_at_mark(buffer, &del, insert_mark);
		gtk_text_buffer_delete(buffer, &del, &iter);
		gtk_text_buffer_get_end_iter(buffer, &iter);
	} else {
		insert_mark = gtk_text_buffer_create_mark(buffer, "real_end_mark", &iter, TRUE);
	}

	/* Decide first if we want to scroll the text to the end or not */
	gtk_text_view_get_iter_location(text_view, &iter, &iter_loc);
	gtk_text_view_get_visible_rect(text_view, &visible_rect);

	gtk_text_buffer_insert(buffer, &iter, text, -1);
	parse_html(text_view, *insert_mark, ignore);

	if (iter_loc.y <= visible_rect.y + visible_rect.height) {
		/* Temporary anonymous mark at the new end, used only as the
		 * scroll target. */
		GtkTextMark *end_mark;

		gtk_text_buffer_get_end_iter(buffer, &iter);
		end_mark = gtk_text_buffer_create_mark(buffer, NULL, &iter, TRUE);
		gtk_text_view_scroll_mark_onscreen(text_view, end_mark);
		gtk_text_buffer_delete_mark(buffer, end_mark);
	}

	if (!(ignore & HTML_IGNORE_END))
		gtk_text_buffer_delete_mark(buffer, insert_mark);

	g_free(text);
}
/*
 * Lay out one HTML table and, when a document is being rendered, draw it.
 *
 * attr: attribute string of the table tag; "border", "cellspacing",
 *       "cellpadding", "align", "frame", "rules", "width" and the background
 *       colour are read from it.
 * html, eof, end: forwarded unchanged to parse_table().
 *       NOTE(review): presumably the start of the table body, the end of
 *       the input and an out-parameter for where parsing stopped - confirm
 *       against parse_table().
 * f:    used as a struct part *; its data/xp fields select the mode below.
 *
 * Modes, as distinguished by the code:
 *   - !p->data && !p->xp : only p->x / p->xmax are raised from the table's
 *     min/max widths, then the function cleans up.
 *   - !p->data && p->xp == 1 : widths are distributed, p->x is raised and
 *     p->cy advanced by t->rh, then the function cleans up.
 *   - !p->data otherwise : as above but after get_table_heights().
 *   - p->data set : the table is displayed and a new node is added to
 *     p->data->nodes.
 *
 * table_level is incremented on entry and decremented on every exit path;
 * free_table_cache() runs when it drops back to zero.
 */
void format_table(unsigned char *attr, unsigned char *html, unsigned char *eof, unsigned char **end, void *f)
{
	struct part *p = f;
	int border, cellsp, vcellpd, cellpd, align;
	int frame, rules, width, wf;
	struct rgb bgcolor;
	struct table *t;
	char *al;
	int cye;
	int x;
	int i;
	/*int llm = last_link_to_move;*/
	struct s_e *bad_html;
	int bad_html_n;
	struct node *n, *nn;
	int cpd_pass, cpd_width, cpd_last;
	/*if (!p->data) {
		debug("nested tables not supported");
		return;
	}*/
	table_level++;
	/* Start from the current paragraph background; get_bgcolor() is given
	 * the attribute string and the colour to update. */
	memcpy(&bgcolor, &par_format.bgcolor, sizeof(struct rgb));
	get_bgcolor(attr, &bgcolor);
	/* No numeric border: treat the mere presence of border/rules/frame
	 * as border = 1. */
	if ((border = get_num(attr, "border")) == -1)
		border = has_attr(attr, "border") || has_attr(attr, "rules") || has_attr(attr, "frame");
	/*if (!border) border = 1;*/
	if ((cellsp = get_num(attr, "cellspacing")) == -1)
		cellsp = 1;
	/* cellpd / vcellpd end up as 0 or 1: padding is switched on only when
	 * the requested value reaches half a character cell plus one. */
	if ((cellpd = get_num(attr, "cellpadding")) == -1) {
		vcellpd = 0;
		cellpd = !!border;
	} else {
		vcellpd = cellpd >= HTML_CHAR_HEIGHT / 2 + 1;
		cellpd = cellpd >= HTML_CHAR_WIDTH / 2 + 1;
	}
	/* No border means no spacing; a border forces spacing of at least 1. */
	if (!border)
		cellsp = 0;
	else if (!cellsp)
		cellsp = 1;
	/* Both values are capped at 2. */
	if (border > 2)
		border = 2;
	if (cellsp > 2)
		cellsp = 2;
	/* Alignment: inherit from the paragraph, defaulting to left, then let
	 * the align attribute override it. */
	align = par_format.align;
	if (align == AL_NO || align == AL_BLOCK)
		align = AL_LEFT;
	if ((al = get_attr_val(attr, "align"))) {
		if (!strcasecmp(al, "left"))
			align = AL_LEFT;
		if (!strcasecmp(al, "center"))
			align = AL_CENTER;
		if (!strcasecmp(al, "right"))
			align = AL_RIGHT;
		mem_free(al);
	}
	/* Outer frame: box unless the frame attribute says otherwise. */
	frame = F_BOX;
	if ((al = get_attr_val(attr, "frame"))) {
		if (!strcasecmp(al, "void"))
			frame = F_VOID;
		if (!strcasecmp(al, "above"))
			frame = F_ABOVE;
		if (!strcasecmp(al, "below"))
			frame = F_BELOW;
		if (!strcasecmp(al, "hsides"))
			frame = F_HSIDES;
		if (!strcasecmp(al, "vsides"))
			frame = F_VSIDES;
		if (!strcasecmp(al, "lhs"))
			frame = F_LHS;
		if (!strcasecmp(al, "rhs"))
			frame = F_RHS;
		if (!strcasecmp(al, "box"))
			frame = F_BOX;
		if (!strcasecmp(al, "border"))
			frame = F_BOX;
		mem_free(al);
	}
	/* Inner rules: all when there is a border, none otherwise, unless the
	 * rules attribute overrides. */
	rules = border ?
		R_ALL : R_NONE;
	if ((al = get_attr_val(attr, "rules"))) {
		if (!strcasecmp(al, "none"))
			rules = R_NONE;
		if (!strcasecmp(al, "groups"))
			rules = R_GROUPS;
		if (!strcasecmp(al, "rows"))
			rules = R_ROWS;
		if (!strcasecmp(al, "cols"))
			rules = R_COLS;
		if (!strcasecmp(al, "all"))
			rules = R_ALL;
		mem_free(al);
	}
	/* Without a border the frame attribute is ignored. */
	if (!border)
		frame = F_VOID;
	/* wf is set to 1 when no width attribute was usable and the width was
	 * derived from the paragraph width minus its margins. */
	wf = 0;
	if ((width = get_width(attr, "width", p->data || p->xp)) == -1) {
		width = par_format.width - par_format.leftmargin - par_format.rightmargin;
		if (width < 0)
			width = 0;
		wf = 1;
	}
	/* NOTE(review): bad_html is freed on the failure path too, so
	 * parse_table() is assumed to always set it - confirm. */
	if (!(t = parse_table(html, eof, end, &bgcolor, p->data || p->xp, &bad_html, &bad_html_n))) {
		mem_free(bad_html);
		goto ret0;
	}
	/* Each bad_html entry is an [s, e) span reported by parse_table():
	 * trim whitespace on both ends and render what is left through
	 * parse_html() into p, before the table itself. */
	for (i = 0; i < bad_html_n; i++) {
		while (bad_html[i].s < bad_html[i].e && WHITECHAR(*bad_html[i].s))
			bad_html[i].s++;
		while (bad_html[i].s < bad_html[i].e && WHITECHAR(bad_html[i].e[-1]))
			bad_html[i].e--;
		if (bad_html[i].s < bad_html[i].e)
			parse_html(bad_html[i].s, bad_html[i].e, put_chars_f, line_break_f, special_f, p, NULL);
	}
	mem_free(bad_html);
	/* Duplicate the HTML stack top for the table; it is released again
	 * with kill_html_stack_item() on the ret2 path. */
	html_stack_dup();
	html_top.dontkill = 1;
	par_format.align = AL_LEFT;
	/* Copy the parsed attributes into the table. */
	t->p = p;
	t->border = border;
	t->cellpd = cellpd;
	t->vcellpd = vcellpd;
	t->cellsp = cellsp;
	t->frame = frame;
	t->rules = rules;
	t->width = width;
	t->wf = wf;
	/* cpd_pass drives the cell-padding retry below: 0 = first attempt,
	 * 1 = retried with padding off, 2 = padding restored (final). */
	cpd_pass = 0;
	cpd_last = t->cellpd;
	cpd_width = 0; /* not needed, but let the warning go away */
	again:
	get_cell_widths(t);
	if (get_column_widths(t))
		goto ret2;
	get_table_width(t);
	/* Width-probing mode: only report min/max widths back through p. */
	if (!p->data && !p->xp) {
		if (!wf && t->max_t > width)
			t->max_t = width;
		if (t->max_t < t->min_t)
			t->max_t = t->min_t;
		if (t->max_t + par_format.leftmargin + par_format.rightmargin > p->xmax)
			p->xmax = t->max_t + par_format.leftmargin + par_format.rightmargin;
		if (t->min_t + par_format.leftmargin + par_format.rightmargin > p->x)
			p->x = t->min_t + par_format.leftmargin + par_format.rightmargin;
		goto ret2;
	}
	/* Table too wide with padding on: recompute with padding off,
	 * remembering the padded minimum width for comparison. */
	if (!cpd_pass && t->min_t > width && t->cellpd) {
		t->cellpd = 0;
		cpd_pass = 1;
		cpd_width = t->min_t;
		goto again;
	}
	/* Dropping the padding made the minimum width larger: restore the
	 * original padding and recompute one last time. */
	if (cpd_pass == 1 && t->min_t > cpd_width) {
		t->cellpd = cpd_last;
		cpd_pass = 2;
		goto again;
	}
	/*debug("%d %d %d", t->min_t, t->max_t, width);*/
	/* Target width: the minimum if it does not fit, the maximum if the
	 * width was only a fallback (wf) and the table is narrower, else the
	 * requested width. */
	if (t->min_t >= width)
		distribute_widths(t, t->min_t);
	else if (t->max_t < width && wf)
		distribute_widths(t, t->max_t);
	else
		distribute_widths(t, width);
	/* p->xp == 1 without a document: report the width (clamped between
	 * t->rw and the paragraph width) and advance p->cy by t->rh. */
	if (!p->data && p->xp == 1) {
		int ww = t->rw + par_format.leftmargin + par_format.rightmargin;
		if (ww > par_format.width)
			ww = par_format.width;
		if (ww < t->rw)
			ww = t->rw;
		if (ww > p->x)
			p->x = ww;
		p->cy += t->rh;
		goto ret2;
	}
#ifdef HTML_TABLE_2ND_PASS
	check_table_widths(t);
#endif
	/* Horizontal position of the table for the chosen alignment, kept
	 * inside [0, par_format.width - t->rw] where possible. */
	x = par_format.leftmargin;
	if (align == AL_CENTER)
		x = (par_format.width + par_format.leftmargin - par_format.rightmargin - t->rw) / 2;
	if (align == AL_RIGHT)
		x = par_format.width - par_format.rightmargin - t->rw;
	if (x + t->rw > par_format.width)
		x = par_format.width - t->rw;
	if (x < 0)
		x = 0;
	/*display_table(t, x, p->cy, &cye);*/
	get_table_heights(t);
	/* No document to draw into: just account for the table's size. */
	if (!p->data) {
		if (t->rw + par_format.leftmargin + par_format.rightmargin > p->x)
			p->x = t->rw + par_format.leftmargin + par_format.rightmargin;
		p->cy += t->rh;
		goto ret2;
	}
	/* Close the first node in the list at the table's top row, draw the
	 * table, then add a new node that starts at the row reported back in
	 * cye and copies the old node's x / xw. */
	n = p->data->nodes.next;
	n->yw = p->yp - n->y + p->cy;
	display_complicated_table(t, x, p->cy, &cye);
	display_table_frames(t, x, p->cy);
	nn = mem_alloc(sizeof(struct node));
	nn->x = n->x;
	nn->y = p->yp + cye;
	nn->xw = n->xw;
	add_to_list(p->data->nodes, nn);
	/*sdbg(p->data);*/
	/*for (y = p->cy; y < cye; y++) {
		last_link_to_move = llm;
		align_line(p, y);
	}*/
	/*if (p->cy + t->rh != cye) internal("size does not match; 1:%d, 2:%d", p->cy + t->rh, cye);*/
	p->cy = cye;
	p->cx = -1;
	/* Common exit once the table exists: propagate the link counter and
	 * height, then free the table and the duplicated stack item. */
	ret2:
	p->link_num = t->link_num;
	if (p->cy > p->y)
		p->y = p->cy;
	/*ret1:*/
	free_table(t);
	kill_html_stack_item(&html_top);
	/* Exit used when parse_table() failed: nothing to free but the
	 * nesting counter. */
	ret0:
	/*ret:*/
	table_level--;
	if (!table_level)
		free_table_cache();
}
void do_format(char *start, char *end, struct part *part, unsigned char *head) { parse_html(start, end, (int (*)(void *, unsigned char *, int)) put_chars_conv, (void (*)(void *)) line_break, (void *(*)(void *, int, ...)) html_special, part, head); /*if ((part->y -= line_breax) < 0) part->y = 0;*/ }
/*
 * Walk the children of `tnod`, collect the URLs they reference and, when
 * `outfile` is non-NULL, write the tree back out as HTML.
 *
 * tdoc:    tidy document owning the nodes.
 * tnod:    node whose children are processed; element children recurse.
 * elem:    URL-list entry being processed; its level is compared with
 *          option_values.depth to decide which kinds of link are followed,
 *          and it is passed on to add_new_url_and_check().
 * indent:  field width used when writing opening/closing tags; children are
 *          written with indent + 1.
 * outfile: output stream, or NULL to only collect URLs.
 *
 * Which attribute is treated as the URL:
 *   - A / AREA / MAP   -> href  (when elem->level is below depth)
 *   - FRAME / IFRAME   -> src   (when elem->level is below depth + 1)
 *   - LINK             -> href, IMG / SCRIPT -> src
 *                               (same level limit, and only when
 *                                no_html_dependencies is off)
 * A depth of 0 disables the level limits.
 *
 * When writing, the URL attribute's value is replaced by the string that
 * add_new_url_and_check() stored in relative_url, if any.
 *
 * NOTE(review): attribute values are written between double quotes without
 * escaping; a value containing '"' would produce malformed markup.
 */
static void parse_html(TidyDoc tdoc, TidyNode tnod, const url_list_t *elem, int indent, FILE *outfile)
{
	TidyNode child;
	TidyAttr attr;
	TidyAttrId attr_id = TidyAttr_UNKNOWN;
	TidyNodeType node_type;
	TidyTagId node_id;
	ctmbstr name;
	ctmbstr attr_name;
	ctmbstr attr_value;
	char *url, *relative_url = NULL;
	int found = 0;
	int get_html_link = (!option_values.depth || elem->level < option_values.depth);
	int get_int_html_link = (!option_values.depth || elem->level < option_values.depth+1);
	int get_ext_depends = ((!option_values.depth || elem->level < option_values.depth+1) && !option_values.no_html_dependencies);

	for (child = tidyGetChild(tnod); child; child = tidyGetNext(child)) {
		node_type = tidyNodeGetType(child);

		switch (node_type) {
		case TidyNode_Start:
		case TidyNode_StartEnd:
			node_id = tidyNodeGetId(child);

			/* Pick the attribute (if any) that carries this element's URL. */
			if (get_html_link && (node_id == TidyTag_A || node_id == TidyTag_AREA || node_id == TidyTag_MAP)) {
				found = 1;
				attr_id = TidyAttr_HREF;
			} else if (get_int_html_link && (node_id == TidyTag_FRAME || node_id == TidyTag_IFRAME)) {
				found = 1;
				attr_id = TidyAttr_SRC;
			} else if (get_ext_depends) {
				if (node_id == TidyTag_LINK) {
					found = 1;
					attr_id = TidyAttr_HREF;
				} else if (node_id == TidyTag_IMG || node_id == TidyTag_SCRIPT) {
					found = 1;
					attr_id = TidyAttr_SRC;
				} else {
					found = 0;
					attr_id = TidyAttr_UNKNOWN;
				}
			} else {
				found = 0;
				attr_id = TidyAttr_UNKNOWN;
			}

			if (found && (attr = tidyAttrGetById(child, attr_id)) != NULL) {
				url = (char *) tidyAttrValue(attr);
				/* NOTE(review): relative_url is tested and released
				 * again below, so string_free() is presumably a macro
				 * that also resets the pointer to NULL - confirm. */
				string_free(relative_url);
				/* A relative form is only requested when writing. */
				if (url && *url)
					add_new_url_and_check(elem, url, outfile ? &relative_url : NULL);
			}

			if (outfile && (name = tidyNodeGetName(child)) != NULL) {
				fprintf(outfile, "%*.*s%s", indent, indent, "<", name);

				for (attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr)) {
					attr_name = tidyAttrName(attr);
					/* Never hand a NULL name to "%s". */
					if (!attr_name)
						continue;
					attr_value = tidyAttrValue(attr);

					fprintf(outfile, " %s", attr_name);
					if (relative_url && (tidyAttrGetId(attr) == attr_id))
						fprintf(outfile, "=\"%s\"", relative_url);
					else
						fprintf(outfile, "=\"%s\"", attr_value ? attr_value : "");
				}
				string_free(relative_url);

				if (node_type == TidyNode_StartEnd) {
					fprintf(outfile, "/>\n");
				} else {
					fprintf(outfile, ">\n");
					parse_html(tdoc, child, elem, indent + 1, outfile);
					fprintf(outfile, "%*.*s%s>\n", indent + 1, indent + 1, "</", name);
				}
			} else {
				/* Not writing (or unnamed element): still descend so
				 * nested links are collected. */
				string_free(relative_url);
				parse_html(tdoc, child, elem, indent + 1, outfile);
			}
			break;

		case TidyNode_End:
			if (outfile) {
				if ((name = tidyNodeGetName(child)) != NULL)
					fprintf(outfile, "%*.*s/%s>\n", indent, indent, "<", name);
			}
			break;

		case TidyNode_Text:
			if (outfile) {
				TidyBuffer buf;
				TidyTagId parent_node_id = tidyNodeGetId(tnod);

				tidyBufInit(&buf);
				/* Text inside SCRIPT / STYLE is fetched with
				 * tidyNodeGetValue(), all other text with
				 * tidyNodeGetText(). */
				if (parent_node_id == TidyTag_SCRIPT || parent_node_id == TidyTag_STYLE)
					tidyNodeGetValue(tdoc, child, &buf);
				else
					tidyNodeGetText(tdoc, child, &buf);
				if (buf.bp)
					fprintf(outfile, "%s", (char *)buf.bp);
				tidyBufFree(&buf);
			}
			break;

		case TidyNode_Comment:
			if (outfile) {
				TidyBuffer buf;

				tidyBufInit(&buf);
				tidyNodeGetValue(tdoc, child, &buf);
				if (buf.bp)
					fprintf(outfile, "<!--%s-->\n", (char *)buf.bp);
				tidyBufFree(&buf);
			}
			break;

		case TidyNode_CDATA:
			if (outfile) {
				TidyBuffer buf;

				tidyBufInit(&buf);
				tidyNodeGetValue(tdoc, child, &buf);
				if (buf.bp)
					fprintf(outfile, "<![CDATA[%s]]>\n", (char *)buf.bp);
				tidyBufFree(&buf);
			}
			break;

		case TidyNode_DocType:
			if (outfile) {
				int pub = 0;

				/* The node name may be NULL; only print it when present
				 * rather than passing NULL to "%s". */
				name = tidyNodeGetName(child);
				fprintf(outfile, "<!DOCTYPE");
				if (name)
					fprintf(outfile, " %s", name);

				for (attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr)) {
					attr_name = tidyAttrName(attr);
					attr_value = tidyAttrValue(attr);

					/* Attribute names are written up to and including
					 * PUBLIC; after that only the quoted values. */
					if (!pub && attr_name) {
						fprintf(outfile, " %s", attr_name);
						if (!string_casecmp(attr_name, "PUBLIC"))
							pub = 1;
					}
					if (attr_value)
						fprintf(outfile, " \"%s\"", attr_value);
				}
				fprintf(outfile, ">\n");
			}
			break;

		default:
			if (outfile) {
				TidyBuffer buf;

				tidyBufInit(&buf);
				tidyNodeGetValue(tdoc, child, &buf);
				if (buf.bp)
					fprintf(outfile, "%s", (char *)buf.bp);
				tidyBufFree(&buf);
			}
			break;
		}
	}
}