Exemple #1
0
/*
 * Builds a MessageBody from a collected body part.
 *
 * body_part      part whose content and content type are copied
 * sanitize_body  when set, the content is parsed as HTML with Gumbo and the
 *                document node is run through sanitize(); otherwise the raw
 *                bytes are copied as they are
 * inlines        forwarded untouched to sanitize()
 *
 * Returns the MessageBody obtained from new_message_body(), or NULL when the
 * body_part precondition check fails.
 */
static MessageBody* get_body(CollectedPart *body_part, gboolean sanitize_body, GPtrArray *inlines) {
  g_return_val_if_fail(body_part != NULL, NULL);

  MessageBody *result = new_message_body();

  // The reported size is always that of the raw content, sanitized or not
  result->size = body_part->content->len;
  result->content_type = g_strdup(body_part->content_type);

  // Private copy of the raw bytes: it is either the final content or the parser input
  GString *raw_copy = g_string_new_len((const gchar*) body_part->content->data, body_part->content->len);
  if (!sanitize_body) {
    result->content = raw_copy;
    return result;
  }

  // Build the HTML tree, then keep only what sanitize() lets through
  // (the original author's note: scripts, bad hrefs, ...)
  GumboOutput *parsed = gumbo_parse_with_options(&kGumboDefaultOptions, raw_copy->str, raw_copy->len);
  result->content = sanitize(parsed->document, inlines);

  // The tree and the raw copy are no longer needed once the sanitized string exists
  gumbo_destroy_output(&kGumboDefaultOptions, parsed);
  g_string_free(raw_copy, TRUE);

  return result;
}
Exemple #2
0
void GumboInterface::parse()
{
    if (!m_source.isEmpty() && (m_output == NULL)) {
  
        if (!m_version.startsWith('3')) {
            m_hasnbsp = m_source.contains(" ");
        }
        m_utf8src = m_source.toStdString();
        // remove any xml header line and any trailing whitespace
        if (m_utf8src.compare(0,5,"<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 5);
            end = m_utf8src.find_first_not_of("\n\r\t\v\f ",end+1);
            m_utf8src.erase(0,end);
        }

        // In case we ever have to revert to earlier versions, please note the following
        // additional initialization is needed because Microsoft Visual Studio 2013 (and earlier?)
        // do not properly initialize myoptions from the static const kGumboDefaultOptions defined
        // in the gumbo library.  Instead whatever was in memory at the time is used causing random 
        // issues later on so if reverting remember to keep these specific changes as the bug 
        // they work around took a long long time to track down
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.tab_stop = 4;
        myoptions.use_xhtml_rules = true;
        myoptions.stop_on_first_error = false;
        myoptions.max_errors = -1;

        // GumboInterface::m_mutex.lock();
        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
        // GumboInterface::m_mutex.unlock();
    }
}
/*
 * Entry point: reads the HTML file named by the single command-line argument,
 * parses it with Gumbo's default options and hands the root node to findTag().
 * Exits with status 1 when the argument count is wrong or the file cannot be
 * opened.
 */
int main(int argc, const char** argv){
	/* Exactly one argument (the HTML file name) is expected. */
	if(argc != 2){
		printf("得到html文件的各个节点(请加上html文件名作为执行参数)");
		exit(1);
	}
	const char* html_path = argv[1];

	FILE *html_file = fopen(html_path, "r");
	if(html_file == NULL){
		printf("file open error!");
		exit(1);
	}

	/* readFile() fills in the buffer and its length; the buffer is released with free() below. */
	char* html_text;
	int html_length;
	readFile(html_file, &html_text, &html_length);

	GumboOutput* parsed = gumbo_parse_with_options(
		&kGumboDefaultOptions, html_text, html_length);
	/* Start the walk at the root element with a second argument of 0. */
	findTag(parsed->root, 0);

	/* Tear down in the same order as before: parse tree, file handle, text buffer. */
	gumbo_destroy_output(&kGumboDefaultOptions, parsed);
	fclose(html_file);
	free(html_text);
}
Exemple #4
0
/**
 * Duktape binding: coerces the value at stack index 0 to a string, parses it
 * with Gumbo's default options and leaves an object with the properties
 * "document" and "root" on the value stack.
 *
 * Returns 1.
 */
static int
es_gumbo_parse(duk_context *ctx)
{
  duk_size_t html_len;
  const char *html = duk_to_lstring(ctx, 0, &html_len);

  /* Holder for the parse result; it starts out with a reference count of 1. */
  es_gumbo_output_t *holder = calloc(1, sizeof(es_gumbo_output_t));
  atomic_set(&holder->ego_refcount, 1);
  holder->ego_output =
    gumbo_parse_with_options(&kGumboDefaultOptions, html, html_len);

  /* The entries of ego_output->errors are not inspected here. */

  /*
   * NOTE(review): the top stack entry is popped before the nodes are pushed.
   * If that entry is the source string, confirm nothing in the parse tree
   * still relies on its buffer.
   */
  duk_pop(ctx);

  /* Result object: { document: <node>, root: <node> } */
  duk_push_object(ctx);

  push_gumbo_node(ctx, holder->ego_output->document, holder);
  duk_put_prop_string(ctx, -2, "document");

  push_gumbo_node(ctx, holder->ego_output->root, holder);
  duk_put_prop_string(ctx, -2, "root");

  /*
   * Release this function's use of the holder. Presumably push_gumbo_node()
   * takes its own references for the pushed nodes - TODO confirm.
   */
  es_gumbo_output_release(holder);
  return 1;
}
Exemple #5
0
// Parses m_source (unless a parse tree already exists in m_output) and returns
// the errors Gumbo recorded for it, one GumboWellFormedError per entry of
// m_output->errors.
//
// Returns an empty list when there is no parse tree to inspect, i.e. m_source
// is empty and nothing has been parsed before.
QList<GumboWellFormedError> GumboInterface::error_check()
{
    QList<GumboWellFormedError> errlist;
    // Correction added to every reported line number so that it matches the
    // text in m_source rather than the adjusted text handed to the parser.
    int line_offset = 0;

    if (!m_source.isEmpty() && (m_output == NULL)) {

        // In case we ever have to revert to earlier versions, please note the following
        // additional initialization is needed because Microsoft Visual Studio 2013 (and earlier?)
        // do not properly initialize myoptions from the static const kGumboDefaultOptions defined
        // in the gumbo library.  Instead whatever was in memory at the time is used causing random
        // issues later on so if reverting remember to keep these specific changes as the bug
        // they work around took a long long time to track down
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.tab_stop = 4;
        myoptions.use_xhtml_rules = true;
        myoptions.stop_on_first_error = false;
        myoptions.max_errors = -1;

        // fix any non html valid self-closing tags
        m_source = fix_self_closing_tags(m_source);

        m_utf8src = m_source.toStdString();
        // remove any xml header line and trailing whitespace
        if (m_utf8src.compare(0,5,"<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 0);
            end = m_utf8src.find_first_not_of("\n\r\t\v\f ",end+1);
            m_utf8src.erase(0,end);
            // the header was dropped from the parsed text, so shift lines down by one
            line_offset++;
        }
        // add in doctype if missing
        if ((m_utf8src.compare(0,9,"<!DOCTYPE") != 0) && (m_utf8src.compare(0,9,"<!doctype") != 0)) {
            m_utf8src.insert(0,"<!DOCTYPE html>\n");
            // one extra line was put in front of the parsed text, so shift lines up by one
            line_offset--;
        }
        // GumboInterface::m_mutex.lock();
        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
        // GumboInterface::m_mutex.unlock();
    }

    // Without a parse tree there is nothing to report. This guard prevents a
    // NULL dereference when m_source is empty and no earlier parse happened.
    if (m_output == NULL) {
        return errlist;
    }

    const GumboVector* errors = &m_output->errors;
    for (unsigned int i = 0; i < errors->length; ++i) {
        GumboError* er = static_cast<GumboError*>(errors->data[i]);
        GumboWellFormedError gperror;
        gperror.line = er->position.line + line_offset;
        gperror.column = er->position.column;

        // Let gumbo render the message into a temporary buffer, then copy it out
        GumboStringBuffer text;
        gumbo_string_buffer_init(&text);
        gumbo_error_to_string(er, &text);
        std::string errmsg(text.data, text.length);
        gperror.message = QString::fromStdString(errmsg);
        gumbo_string_buffer_destroy(&text);

        errlist.append(gperror);
    }
    return errlist;
}
Exemple #6
0
void GumboInterface::parse()
{
    if (!m_source.isEmpty() && (m_output == NULL)) {
        m_utf8src = m_source.toStdString();
        // remove any xml header line and any trailing whitespace
        if (m_utf8src.compare(0,5,"<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 5);
            end = m_utf8src.find_first_not_of("\n\r\t ",end+1);
            m_utf8src.erase(0,end);
        }
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.use_xhtml_rules = true;
        myoptions.tab_stop = 4;
        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
    }
}
Exemple #7
0
/*
 * Lua binding: parse(html [, tab_stop])
 *
 * Argument 1 is the text to parse (required string); argument 2 is the tab
 * stop, defaulting to 8. On success one value is returned: whatever
 * push_node() pushed for the document node, with its "root" field set.
 * When no parse output is produced, returns nil plus "Failed to parse".
 */
static int parse(lua_State *L) {
    size_t html_len;
    const char *html = luaL_checklstring(L, 1, &html_len);

    GumboOptions opts = kGumboDefaultOptions;
    opts.tab_stop = luaL_optint(L, 2, 8);

    GumboOutput *tree = gumbo_parse_with_options(&opts, html, html_len);
    if (!tree) {
        lua_pushnil(L);
        lua_pushliteral(L, "Failed to parse");
        return 2;
    }

    push_node(L, tree->document);
    /* Look the root up among the document's entries; the +1 presumably maps
       gumbo's 0-based child index onto 1-based Lua indices - TODO confirm. */
    lua_rawgeti(L, -1, tree->root->index_within_parent + 1);
    lua_setfield(L, -2, "root");

    gumbo_destroy_output(&opts, tree);
    return 1;
}
Exemple #8
0
// Parses m_source (unless a parse tree already exists in m_output) and returns
// the errors Gumbo recorded for it, one GumboWellFormedError per entry of
// m_output->errors.
//
// Returns an empty list when there is no parse tree to inspect, i.e. m_source
// is empty and nothing has been parsed before.
QList<GumboWellFormedError> GumboInterface::error_check()
{
    QList<GumboWellFormedError> errlist;
    // Correction added to every reported line number so that it matches the
    // text in m_source rather than the adjusted text handed to the parser.
    int line_offset = 0;

    if (!m_source.isEmpty() && (m_output == NULL)) {
        GumboOptions myoptions = kGumboDefaultOptions;
        myoptions.use_xhtml_rules = true;
        myoptions.tab_stop = 4;
        // leave this as false to prevent pre-mature stopping when no error exists
        myoptions.stop_on_first_error = false;

        m_utf8src = m_source.toStdString();
        // remove any xml header line and trailing whitespace
        if (m_utf8src.compare(0,5,"<?xml") == 0) {
            size_t end = m_utf8src.find_first_of('>', 0);
            end = m_utf8src.find_first_not_of("\n\r\t ",end+1);
            m_utf8src.erase(0,end);
            // the header was dropped from the parsed text, so shift lines down by one
            line_offset++;
        }
        // add in doctype if missing
        if ((m_utf8src.compare(0,9,"<!DOCTYPE") != 0) && (m_utf8src.compare(0,9,"<!doctype") != 0)) {
            m_utf8src.insert(0,"<!DOCTYPE html>\n");
            // one extra line was put in front of the parsed text, so shift lines up by one
            line_offset--;
        }
        m_output = gumbo_parse_with_options(&myoptions, m_utf8src.data(), m_utf8src.length());
    }

    // Without a parse tree there is nothing to report. This guard prevents a
    // NULL dereference when m_source is empty and no earlier parse happened.
    if (m_output == NULL) {
        return errlist;
    }

    const GumboVector* errors = &m_output->errors;
    // errors->length is unsigned, so the index is unsigned as well
    for (unsigned int i = 0; i < errors->length; ++i) {
        GumboError* er = static_cast<GumboError*>(errors->data[i]);
        GumboWellFormedError gperror;
        gperror.line = er->position.line + line_offset;
        gperror.column = er->position.column;

        // Let gumbo render the message into a temporary buffer, then copy it out
        GumboStringBuffer text;
        gumbo_string_buffer_init(&text);
        gumbo_error_to_string(er, &text);
        std::string errmsg(text.data, text.length);
        gperror.message = QString::fromStdString(errmsg);
        gumbo_string_buffer_destroy(&text);

        errlist.append(gperror);
    }
    return errlist;
}
Exemple #9
0
/*
 * Lua binding: parse(html [, tab_stop [, context_tag [, namespace, tables...]]])
 *
 *   1  text to parse (required string)
 *   2  tab stop, defaults to 8
 *   3  optional tag name used as the fragment context
 *   4  fragment namespace: "html" (default), "svg" or "math"
 *   5+ nupvalues tables, captured as upvalues of the push_document closure
 *
 * Returns the single result of calling push_document on the parsed document.
 * On failure returns nil plus a message (NULL parser output, a non-OK output
 * status, or an error raised inside the protected call).
 */
static int parse(lua_State *L) {
    static const char *namespaces[] = {"html", "svg", "math", NULL};
    size_t html_len, context_len;

    GumboOptions options = kGumboDefaultOptions;
    options.max_errors = 0;

    /* Arguments are validated in positional order so that the first bad one is reported. */
    const char *html = luaL_checklstring(L, 1, &html_len);
    options.tab_stop = (int)luaL_optinteger(L, 2, 8);
    const char *context_tag = luaL_optlstring(L, 3, NULL, &context_len);
    if (context_tag != NULL) {
        options.fragment_context = gumbo_tagn_enum(context_tag, context_len);
    }
    options.fragment_namespace = luaL_checkoption(L, 4, "html", namespaces);

    /* Stack slots 5 .. nupvalues + 4 must all hold tables. */
    for (int n = 0; n < nupvalues; n++) {
        luaL_checktype(L, n + 5, LUA_TTABLE);
    }
    lua_pushcclosure(L, push_document, nupvalues);

    GumboOutput *output = gumbo_parse_with_options(&options, html, html_len);
    if (output == NULL) {
        lua_pushnil(L);
        lua_pushliteral(L, "gumbo_parse_with_options() returned NULL");
        return 2;
    }

    /* Copy the status out first: the output is destroyed before it is reported. */
    const GumboOutputStatus parse_status = output->status;
    if (parse_status != GUMBO_STATUS_OK) {
        gumbo_destroy_output(output);
        lua_pushnil(L);
        lua_pushstring(L, gumbo_status_to_string(parse_status));
        return 2;
    }

    /* Run push_document in protected mode so the output is destroyed even if it raises. */
    lua_pushlightuserdata(L, &output->document->v.document);
    const int call_status = lua_pcall(L, 1, 1, 0);
    gumbo_destroy_output(output);

    if (call_status != 0) { /* anything other than LUA_OK */
        /* The error value is on top of the stack; return it after a leading nil. */
        lua_pushnil(L);
        lua_pushvalue(L, -2);
        return 2;
    }
    return 1;
}