// Build a MessageBody from a collected part.
//
//   body_part     - source part; its content bytes and content type are copied.
//                   Must not be NULL (checked with g_return_val_if_fail).
//   sanitize_body - when TRUE, the content is parsed with Gumbo and the result
//                   of sanitize() is stored; when FALSE, the bytes are stored
//                   unchanged.
//   inlines       - handed through to sanitize() as-is.
//
// Returns the new MessageBody, or NULL when body_part is NULL.
static MessageBody* get_body(CollectedPart *body_part, gboolean sanitize_body, GPtrArray *inlines) {
    g_return_val_if_fail(body_part != NULL, NULL);

    MessageBody *result = new_message_body();

    // The recorded size is deliberately that of the raw part, even when the
    // stored content ends up being the sanitized version.
    result->size = body_part->content->len;
    result->content_type = g_strdup(body_part->content_type);

    // Either way we start from a private copy of the part's bytes.
    GString *copy = g_string_new_len((const gchar*) body_part->content->data, body_part->content->len);

    if (!sanitize_body) {
        // No sanitizing requested: the copy itself becomes the body content.
        result->content = copy;
        return result;
    }

    // Parse the HTML, let sanitize() build the cleaned-up content (per the
    // original author's note: strips disallowed tags such as scripts or bad
    // hrefs), then release the parse tree and the temporary copy.
    GumboOutput *tree = gumbo_parse_with_options(&kGumboDefaultOptions, copy->str, copy->len);
    result->content = sanitize(tree->document, inlines);
    gumbo_destroy_output(&kGumboDefaultOptions, tree);
    g_string_free(copy, TRUE);

    return result;
}
void GumboInterface::parse() { if (!m_source.isEmpty() && (m_output == NULL)) { if (!m_version.startsWith('3')) { m_hasnbsp = m_source.contains(" "); } m_utf8src = m_source.toStdString(); // remove any xml header line and any trailing whitespace if (m_utf8src.compare(0,5,"<?xml") == 0) { size_t end = m_utf8src.find_first_of('>', 5); end = m_utf8src.find_first_not_of("\n\r\t\v\f ",end+1); m_utf8src.erase(0,end); } // In case we ever have to revert to earlier versions, please note the following // additional initialization is needed because Microsoft Visual Studio 2013 (and earlier?) // do not properly initialize myoptions from the static const kGumboDefaultOptions defined // in the gumbo library. Instead whatever was in memory at the time is used causing random // issues later on so if reverting remember to keep these specific changes as the bug // they work around took a long long time to track down GumboOptions myoptions = kGumboDefaultOptions; myoptions.tab_stop = 4; myoptions.use_xhtml_rules = true; myoptions.stop_on_first_error = false; myoptions.max_errors = -1; // GumboInterface::m_mutex.lock(); m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length()); // GumboInterface::m_mutex.unlock(); } }
int main(int argc, const char** argv){ if(argc != 2){ printf("得到html文件的各个节点(请加上html文件名作为执行参数)"); exit(1); } const char* filename = argv[1]; // FILE *fp; if(!(fp = fopen(filename, "r"))){ printf("file open error!"); exit(1); } // char* input; int length; readFile(fp, &input, &length); // GumboOutput* output = gumbo_parse_with_options( &kGumboDefaultOptions, input, length); findTag(output->root, 0); // gumbo_destroy_output(&kGumboDefaultOptions, output); fclose(fp); free(input); }
/**
 * Duktape binding: parse the value at stack index 0 as HTML.
 *
 * The argument is coerced to a string, parsed with Gumbo's default options,
 * and replaced on the stack by a new object with two properties:
 *   "document" - node pushed for the output's document
 *   "root"     - node pushed for the output's root
 *
 * Errors collected by Gumbo are not inspected here (the loop that walked
 * them was left commented out by the original author).
 *
 * Returns 1, i.e. the object on top of the stack is the result.
 */
static int
es_gumbo_parse(duk_context *ctx)
{
  duk_size_t html_len;
  const char *html = duk_to_lstring(ctx, 0, &html_len);

  /* Wrapper holding the parse output; starts with a single reference. */
  es_gumbo_output_t *holder = calloc(1, sizeof(es_gumbo_output_t));
  atomic_set(&holder->ego_refcount, 1);

  holder->ego_output =
    gumbo_parse_with_options(&kGumboDefaultOptions, html, html_len);

  /* Swap the input string on the stack for the result object. */
  duk_pop(ctx);
  duk_push_object(ctx);

  push_gumbo_node(ctx, holder->ego_output->document, holder);
  duk_put_prop_string(ctx, -2, "document");

  push_gumbo_node(ctx, holder->ego_output->root, holder);
  duk_put_prop_string(ctx, -2, "root");

  /* Presumably drops the initial reference taken above; the holder is also
     handed to push_gumbo_node() for each node -- confirm against that helper. */
  es_gumbo_output_release(holder);

  return 1;
}
// Run a well-formedness check and return the errors Gumbo reported.
//
// If nothing has been parsed yet and m_source is not empty, the source is
// first normalized (self-closing tags fixed, xml header removed, a doctype
// inserted when missing) and parsed; the result is kept in m_output.
// If m_output already exists, its recorded errors are reported as-is.
//
// Returns one GumboWellFormedError (line, column, message) per Gumbo error.
// Returns an empty list when there is no parse output at all, e.g. when
// m_source is empty and nothing was parsed earlier.
QList<GumboWellFormedError> GumboInterface::error_check()
{
    QList<GumboWellFormedError> errlist;

    // Added to every reported line to compensate for the edits made to the
    // text before parsing: +1 when the xml header is removed, -1 when a
    // doctype line is inserted.
    // NOTE(review): the +1 assumes the xml header occupies exactly one line.
    int line_offset = 0;

    if (!m_source.isEmpty() && (m_output == NULL)) {
        // In case we ever have to revert to earlier versions, please note the
        // following additional initialization is needed because Microsoft
        // Visual Studio 2013 (and earlier?) does not properly initialize
        // myoptions from the static const kGumboDefaultOptions defined in the
        // gumbo library. Instead whatever was in memory at the time is used,
        // causing random issues later on, so if reverting remember to keep
        // these specific changes as the bug they work around took a long,
        // long time to track down.
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.tab_stop = 4;
        myoptions.use_xhtml_rules = true;
        myoptions.stop_on_first_error = false;
        myoptions.max_errors = -1;

        // fix any non html valid self-closing tags
        m_source = fix_self_closing_tags(m_source);
        m_utf8src = m_source.toStdString();

        // remove any xml header line and the whitespace that follows it
        if (m_utf8src.compare(0, 5, "<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 0);
            end = m_utf8src.find_first_not_of("\n\r\t\v\f ", end + 1);
            m_utf8src.erase(0, end);
            line_offset++;
        }

        // add in doctype if missing
        if ((m_utf8src.compare(0, 9, "<!DOCTYPE") != 0) &&
            (m_utf8src.compare(0, 9, "<!doctype") != 0)) {
            m_utf8src.insert(0, "<!DOCTYPE html>\n");
            line_offset--;
        }

        // Locking GumboInterface::m_mutex around this call was disabled by
        // the original author.
        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
    }

    // With an empty source and no earlier parse there is no output to read;
    // dereferencing m_output here used to crash.
    if (m_output == NULL) {
        return errlist;
    }

    const GumboVector* errors = &m_output->errors;
    for (unsigned int i = 0; i < errors->length; ++i) {
        GumboError* er = static_cast<GumboError*>(errors->data[i]);

        GumboWellFormedError gperror;
        gperror.line = er->position.line + line_offset;
        gperror.column = er->position.column;

        // Render the error into a temporary gumbo buffer, copy it out, and
        // release the buffer again.
        GumboStringBuffer text;
        gumbo_string_buffer_init(&text);
        gumbo_error_to_string(er, &text);
        std::string errmsg(text.data, text.length);
        gperror.message = QString::fromStdString(errmsg);
        gumbo_string_buffer_destroy(&text);

        errlist.append(gperror);
    }
    return errlist;
}
void GumboInterface::parse() { if (!m_source.isEmpty() && (m_output == NULL)) { m_utf8src = m_source.toStdString(); // remove any xml header line and any trailing whitespace if (m_utf8src.compare(0,5,"<?xml") == 0) { size_t end = m_utf8src.find_first_of('>', 5); end = m_utf8src.find_first_not_of("\n\r\t ",end+1); m_utf8src.erase(0,end); } GumboOptions myoptions = kGumboDefaultOptions; myoptions.use_xhtml_rules = true; myoptions.tab_stop = 4; m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length()); } }
// Lua binding: parse(html [, tab_stop]).
//
//   arg 1 - HTML source (string, required).
//   arg 2 - tab stop width (integer, optional, defaults to 8).
//
// On success returns 1 value: the table pushed by push_node() for the
// document, with its "root" field set to the entry at the root node's
// position among the document's children.
// When Gumbo yields no output, returns 2 values: nil and "Failed to parse".
static int parse(lua_State *L) {
    size_t length;
    const char *input = luaL_checklstring(L, 1, &length);

    GumboOptions options = kGumboDefaultOptions;
    // luaL_optint is a deprecated macro that Lua 5.3 only provides under a
    // compatibility define; spell out what it expands to so this builds on
    // every Lua version.
    options.tab_stop = (int)luaL_optinteger(L, 2, 8);

    GumboOutput *output = gumbo_parse_with_options(&options, input, length);
    if (output == NULL) {
        lua_pushnil(L);
        lua_pushliteral(L, "Failed to parse");
        return 2;
    }

    // NOTE(review): if push_node() raises a Lua error, control never reaches
    // gumbo_destroy_output() below -- confirm whether that leak matters.
    push_node(L, output->document);

    // index_within_parent is zero-based while Lua sequences start at 1.
    // Assumes push_node() left the document table on top of the stack with
    // its children stored at integer keys -- TODO confirm.
    lua_rawgeti(L, -1, output->root->index_within_parent + 1);
    lua_setfield(L, -2, "root");

    gumbo_destroy_output(&options, output);
    return 1;
}
// Run a well-formedness check and return the errors Gumbo reported.
//
// If nothing has been parsed yet and m_source is not empty, the xml header is
// removed, a doctype is inserted when missing, and the text is parsed; the
// result is kept in m_output. If m_output already exists, its recorded errors
// are reported as-is.
//
// Returns one GumboWellFormedError (line, column, message) per Gumbo error.
// Returns an empty list when there is no parse output at all, e.g. when
// m_source is empty and nothing was parsed earlier.
QList<GumboWellFormedError> GumboInterface::error_check()
{
    QList<GumboWellFormedError> errlist;

    // Added to every reported line to compensate for the edits made to the
    // text before parsing: +1 when the xml header is removed, -1 when a
    // doctype line is inserted.
    // NOTE(review): the +1 assumes the xml header occupies exactly one line.
    int line_offset = 0;

    if (!m_source.isEmpty() && (m_output == NULL)) {
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.use_xhtml_rules = true;
        myoptions.tab_stop = 4;
        // leave this as false to prevent premature stopping when no error exists
        myoptions.stop_on_first_error = false;

        m_utf8src = m_source.toStdString();

        // remove any xml header line and the whitespace that follows it
        if (m_utf8src.compare(0, 5, "<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 0);
            end = m_utf8src.find_first_not_of("\n\r\t ", end + 1);
            m_utf8src.erase(0, end);
            line_offset++;
        }

        // add in doctype if missing
        if ((m_utf8src.compare(0, 9, "<!DOCTYPE") != 0) &&
            (m_utf8src.compare(0, 9, "<!doctype") != 0)) {
            m_utf8src.insert(0, "<!DOCTYPE html>\n");
            line_offset--;
        }

        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
    }

    // With an empty source and no earlier parse there is no output to read;
    // dereferencing m_output here used to crash.
    if (m_output == NULL) {
        return errlist;
    }

    const GumboVector* errors = &m_output->errors;
    // The vector length is unsigned, so the index is too (avoids a
    // signed/unsigned comparison).
    for (unsigned int i = 0; i < errors->length; ++i) {
        GumboError* er = static_cast<GumboError*>(errors->data[i]);

        GumboWellFormedError gperror;
        gperror.line = er->position.line + line_offset;
        gperror.column = er->position.column;

        // Render the error into a temporary gumbo buffer, copy it out, and
        // release the buffer again.
        GumboStringBuffer text;
        gumbo_string_buffer_init(&text);
        gumbo_error_to_string(er, &text);
        std::string errmsg(text.data, text.length);
        gperror.message = QString::fromStdString(errmsg);
        gumbo_string_buffer_destroy(&text);

        errlist.append(gperror);
    }
    return errlist;
}
// Lua binding: parse(html [, tab_stop [, context_tag [, namespace, tables...]]]).
//
//   arg 1  - HTML source (string, required).
//   arg 2  - tab stop width (integer, optional, defaults to 8).
//   arg 3  - fragment context tag name (string, optional).
//   arg 4  - fragment namespace: "html" (default), "svg" or "math".
//   arg 5+ - nupvalues tables, captured as upvalues of push_document.
//
// Returns the single value produced by push_document on success, otherwise
// nil followed by an error message.
static int parse(lua_State *L) {
    static const char *namespaces[] = {"html", "svg", "math", NULL};

    GumboOptions opts = kGumboDefaultOptions;
    opts.max_errors = 0;

    // Argument validation; each check may raise a Lua error, so the order is
    // kept exactly as before.
    size_t html_len;
    const char *html = luaL_checklstring(L, 1, &html_len);

    opts.tab_stop = (int)luaL_optinteger(L, 2, 8);

    size_t context_len;
    const char *context = luaL_optlstring(L, 3, NULL, &context_len);
    if (context != NULL) {
        opts.fragment_context = gumbo_tagn_enum(context, context_len);
    }

    opts.fragment_namespace = luaL_checkoption(L, 4, "html", namespaces);

    // The tables start right after the four fixed arguments.
    for (int k = 0; k < nupvalues; k++) {
        luaL_checktype(L, k + 5, LUA_TTABLE);
    }

    // Wrap push_document in a closure carrying those tables so it can be run
    // under lua_pcall once the tree exists.
    lua_pushcclosure(L, push_document, nupvalues);

    GumboOutput *tree = gumbo_parse_with_options(&opts, html, html_len);
    if (tree == NULL) {
        lua_pushnil(L);
        lua_pushliteral(L, "gumbo_parse_with_options() returned NULL");
        return 2;
    }

    const GumboOutputStatus parse_status = tree->status;
    if (parse_status != GUMBO_STATUS_OK) {
        gumbo_destroy_output(tree);
        lua_pushnil(L);
        lua_pushstring(L, gumbo_status_to_string(parse_status));
        return 2;
    }

    // Build the Lua representation in protected mode so the tree is destroyed
    // whether or not the call raises.
    lua_pushlightuserdata(L, &tree->document->v.document);
    const int rc = lua_pcall(L, 1, 1, 0);
    gumbo_destroy_output(tree);

    if (rc != 0) {
        // Return nil plus a copy of the error value left by lua_pcall.
        lua_pushnil(L);
        lua_pushvalue(L, -2);
        return 2;
    }

    // rc == 0 (LUA_OK): the call's single result is on top of the stack.
    return 1;
}