int html2text(char *text, const char *content) { int ret; xmlNodePtr root; xmlErrorPtr err; htmlParserCtxtPtr parser; parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0); /* htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); */ htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); ret = htmlParseChunk(parser, content, xmlStrlen(content), 0); if (ret != 0) { err = xmlCtxtGetLastError(parser); fprintf(stderr, "htmlParseChunk failure: %d: %s\n", \ ret, err->message); } ret = htmlParseChunk(parser, NULL, 0, 1); if (ret != 0) { err = xmlCtxtGetLastError(parser); fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", \ ret, err->message); } root = xmlDocGetRootElement(parser->myDoc); walkTree(parser->myDoc, root, text); return 0; }
static void parseHtml(const std::string &html, std::string &title) { htmlParserCtxtPtr ctxt; Context context; ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt, html.c_str(), html.size(), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); title = context.title; }
/*
 * parse_sami:
 * @state: parser state; state->user_data is used as the GstSamiContext.
 * @line: one line of input text.
 *
 * Passes @line through fix_invalid_entities() and feeds the result to the
 * context's HTML push parser as a non-final chunk.
 *
 * If the context reports a result (context->has_result) afterwards:
 *  - a non-empty rubybuf gets a '\n' appended, is prepended to resultbuf,
 *    and is then truncated to zero length;
 *  - resultbuf's character data is detached with g_string_free (.., FALSE)
 *    and returned, and resultbuf is replaced by a new empty GString;
 *  - state->start_time is set to context->time1 and state->duration to
 *    context->time2 - context->time1;
 *  - has_result is cleared.
 *
 * Returns: the detached result string, or NULL when no result is pending.
 * The string comes from g_string_free (.., FALSE); presumably the caller
 * releases it with g_free() -- confirm against the callers.
 *
 * NOTE(review): the leading "EXPORT_C #endif" is the tail of a preprocessor
 * conditional whose opening directive is not visible in this excerpt.
 */
EXPORT_C #endif gchar * parse_sami (ParserState * state, const gchar * line) { gchar *fixed_line; GstSamiContext *context = (GstSamiContext *) state->user_data; fixed_line = fix_invalid_entities (line); htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0); g_free (fixed_line); if (context->has_result) { gchar *r; if (context->rubybuf->len) { context->rubybuf = g_string_append_c (context->rubybuf, '\n'); g_string_prepend (context->resultbuf, context->rubybuf->str); context->rubybuf = g_string_truncate (context->rubybuf, 0); } r = g_string_free (context->resultbuf, FALSE); context->resultbuf = g_string_new (""); state->start_time = context->time1; state->duration = context->time2 - context->time1; context->has_result = FALSE; return r; } return NULL; }
/*
 * sami_context_deinit:
 * @state: parser state whose user_data holds the GstSamiContext to destroy.
 *
 * Does nothing when state->user_data is NULL. Otherwise:
 *  - sends an empty terminating chunk to the HTML push parser;
 *  - remembers the parser's myDoc pointer, frees the parser context, and
 *    then frees that document if there was one;
 *  - frees the buf, rubybuf, resultbuf and state GStrings together with
 *    their character data;
 *  - frees the context structure and resets state->user_data to NULL.
 *
 * NOTE(review): the leading "EXPORT_C #endif" is the tail of a preprocessor
 * conditional whose opening directive is not visible in this excerpt.
 */
EXPORT_C #endif void sami_context_deinit (ParserState * state) { GstSamiContext *context = (GstSamiContext *) state->user_data; if (context) { htmlParserCtxtPtr htmlctxt = context->htmlctxt; /* destroy sax context */ htmlDocPtr doc; htmlParseChunk (htmlctxt, "", 0, 1); doc = htmlctxt->myDoc; htmlFreeParserCtxt (htmlctxt); context->htmlctxt = NULL; if (doc) xmlFreeDoc (doc); g_string_free (context->buf, TRUE); g_string_free (context->rubybuf, TRUE); g_string_free (context->resultbuf, TRUE); g_string_free (context->state, TRUE); g_free (context); state->user_data = NULL; } }
void parse(const std::string &page) { htmlParserCtxtPtr ctxt; htmlSAXHandler handler; memset(&handler, 0, sizeof(handler)); handler.startElement = static_parser_start_element; handler.endElement = static_parser_end_element; handler.characters = static_parser_characters; ctxt = htmlCreatePushParserCtxt(&handler, this, "", 0, "", XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt, page.c_str(), page.size(), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); }
/*
 * Run `string` through the HTML push parser with a SAX handler whose only
 * callback is unhtmlizeHandleCharacters; `buffer` is passed along as the
 * SAX user data.
 */
static void
_unhtmlize (gchar *string, ResultBuffer *buffer)
{
	htmlSAXHandlerPtr	handler;
	htmlParserCtxtPtr	parser;

	/* zero-filled handler: every callback except "characters" stays unset */
	handler = g_new0 (htmlSAXHandler, 1);
	handler->characters = unhtmlizeHandleCharacters;

	/* the complete input is supplied as the parser's initial chunk ... */
	parser = htmlCreatePushParserCtxt (handler, buffer,
	                                   string, strlen (string),
	                                   "", XML_CHAR_ENCODING_UTF8);

	/* ... so only a zero-length terminating chunk remains to be sent */
	htmlParseChunk (parser, string, 0, 1);

	htmlFreeParserCtxt (parser);
	g_free (handler);
}
/*
 * call-seq:
 *  native_write(chunk, last_chunk)
 *
 * Feed +chunk+ to the push parser. A nil +chunk+ sends no data. Passing
 * true for +last_chunk+ marks this as the terminating chunk. When the
 * parser reports a failure and the context was not configured with the
 * recover option, the parser's last error is raised.
 */
static VALUE native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
{
  xmlParserCtxtPtr ctx;
  const char *chunk = NULL;
  int size = 0;
  int terminate;
  int status;

  Data_Get_Struct(self, xmlParserCtxt, ctx);

  if (Qnil != _chunk) {
    chunk = StringValuePtr(_chunk);
    size = (int)RSTRING_LEN(_chunk);
  }

  terminate = (Qtrue == _last_chunk) ? 1 : 0;
  status = htmlParseChunk(ctx, chunk, size, terminate);

  /* Success, or a failure the caller asked us to recover from silently. */
  if (status == 0 || (ctx->options & XML_PARSE_RECOVER)) {
    return self;
  }

  Nokogiri_error_raise(NULL, xmlCtxtGetLastError(ctx));

  return self;
}
void se_parser::parse_output(char *output, std::vector<search_snippet*> *snippets, const int &count_offset) throw (sp_exception) { _count = count_offset; htmlParserCtxtPtr ctxt = NULL; parser_context pc; pc._parser = this; pc._snippets = snippets; pc._current_snippet = NULL; htmlSAXHandler saxHandler = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, start_element_wrapper, end_element_wrapper, NULL, characters_wrapper, NULL, NULL, NULL, NULL, NULL, NULL, NULL, cdata_wrapper, NULL, NULL, NULL, NULL, NULL, NULL }; //mutex_lock(&se_parser::_se_parser_mutex); int status = 0; try { ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "", XML_CHAR_ENCODING_UTF8); // encoding here. htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR); status = htmlParseChunk(ctxt,output,strlen(output),0); } catch (std::exception e) { errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.", e.what()); throw sp_exception(WB_ERR_PARSE,e.what()); } catch (...) // catch everything else to avoid crashes. { std::string msg = "Unknown error in xml/html parsing of search results"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); throw sp_exception(WB_ERR_PARSE,msg); } if (status == 0) { if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); } else // an error occurred. { xmlErrorPtr xep = xmlCtxtGetLastError(ctxt); if (xep) { std::string err_msg = std::string(xep->message); miscutil::replace_in_string(err_msg,"\n",""); errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s", err_msg.c_str()); // check on error level. if (xep->level == 3) // fatal or recoverable error. { std::string msg = "libxml2 fatal error"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); throw sp_exception(WB_ERR_PARSE,msg); } // XXX: too verbose, and confusing to users. 
else if (xep->level == 2) { std::string msg = "libxml2 recoverable error"; errlog::log_error(LOG_LEVEL_DEBUG,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); //throw sp_exception(WB_ERR_PARSE,msg); } } } }
/*
 * html_file_open:
 * @fo: unused.
 * @io_context: receives an error when no document could be produced.
 * @wb_view: workbook view passed on to html_search_for_tables().
 * @input: the HTML input; must not be NULL.
 *
 * Reads the first four bytes of @input to detect the character encoding and
 * the length of any byte order mark, feeds the remainder of the input to
 * libxml2's HTML push parser in chunks of at most 4096 bytes, and then
 * calls html_search_for_tables() on every top-level node of the document.
 *
 * If no document was produced (input shorter than four bytes, a failed
 * read of the header, or a parser context that could not be created),
 * "Unable to parse the html." is reported through @io_context. A failed
 * initial seek returns silently.
 */
void
html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
		WorkbookView *wb_view, GsfInput *input)
{
	guint8 const *buf;
	gsf_off_t size;
	int len, bomlen;
	htmlParserCtxtPtr ctxt;
	htmlDocPtr doc = NULL;
	xmlCharEncoding enc;
	GnmHtmlTableCtxt tc;

	g_return_if_fail (input != NULL);

	if (gsf_input_seek (input, 0, G_SEEK_SET))
		return;

	size = gsf_input_size (input);
	if (size >= 4) {
		size -= 4;
		buf = gsf_input_read (input, 4, NULL);
		if (buf != NULL) {
			enc = xmlDetectCharEncoding (buf, 4);
			switch (enc) {	/* Skip byte order mark */
			case XML_CHAR_ENCODING_UCS4BE:
			case XML_CHAR_ENCODING_UCS4LE:
			case XML_CHAR_ENCODING_UCS4_2143:
			case XML_CHAR_ENCODING_UCS4_3412:
			case XML_CHAR_ENCODING_EBCDIC:
				bomlen = 4;
				break;
			case XML_CHAR_ENCODING_UTF16BE:
			case XML_CHAR_ENCODING_UTF16LE:
				bomlen = 2;
				break;
			case XML_CHAR_ENCODING_UTF8:
				if (buf[0] == 0xef)
					bomlen = 3;
				else if (buf[0] == 0x3c)
					bomlen = 4;
				else
					bomlen = 0;
				break;
			case XML_CHAR_ENCODING_NONE:
				bomlen = 0;
				/* Try to detect unmarked UTF16LE
				   (Firefox Windows clipboard, drag data all platforms) */
				if ((buf[0] >= 0x20 || g_ascii_isspace (buf[0])) &&
				    buf[1] == 0 &&
				    (buf[2] >= 0x20 || g_ascii_isspace (buf[2])) &&
				    buf[3] == 0)
					enc = XML_CHAR_ENCODING_UTF16LE;
				break;
			default:
				bomlen = 0;
			}

			/* The header bytes that follow the BOM seed the parser. */
			ctxt = htmlCreatePushParserCtxt (
				NULL, NULL,
				(char const *)(buf + bomlen), 4 - bomlen,
				gsf_input_name (input), enc);

			/* Creation can fail; without a context there is nothing
			 * to parse and doc stays NULL, which is reported below. */
			if (ctxt != NULL) {
				for (; size > 0 ; size -= len) {
					len = MIN (4096, size);
					buf = gsf_input_read (input, len, NULL);
					if (buf == NULL)
						break;
					htmlParseChunk (
						ctxt, (char const *)buf, len, 0);
				}

				/* Zero-length terminating chunk ends the parse. */
				htmlParseChunk (ctxt, (char const *)buf, 0, 1);
				doc = ctxt->myDoc;
				htmlFreeParserCtxt (ctxt);
			}
		}
	}

	if (doc != NULL) {
		xmlNodePtr ptr;

		tc.sheet = NULL;
		tc.row = -1;
		tc.wb_view = wb_view;
		for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
			html_search_for_tables (ptr, doc, wb_view, &tc);
		xmlFreeDoc (doc);
	} else
		go_io_error_info_set (io_context,
			go_error_info_new_str (_("Unable to parse the html.")));
}
void HttpPullData::parseHtml(const std::string &html,int pos,PARSE_ACTION action){ htmlParserCtxtPtr ctxt; Context context; ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt,html.c_str()+pos,max((unsigned long)0,html.size()-pos), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); switch(action){ case GET_OPTIONS: options=context.options; break; case GET_OZONE: ozone=-1; for(unsigned int i=0;i<context.tableText.size();i++){ if(context.tableText[i].find(" DU",0)>=0){ ozone=atof(StringTools::stripString(context.tableText[i]," DU").c_str()); break; } } break; case GET_LOCATION: for(int i=0;i<context.boldText.size();i++){ if(context.boldText[i]=="Ecoregion "){ info.ecoregion=context.boldText[i+1]; } else if(context.boldText[i]=="Latitude"){ info.latitude=atof(StringTools::stripString(context.boldText[i+1]," N").c_str()); } else if(context.boldText[i]=="Longitude"){ if((int)context.boldText[i+1].find(" W")>=0){ info.longitude=atof(StringTools::stripString(context.boldText[i+1]," W").c_str()); } else if((int)context.boldText[i+1].find(" E")>=0){ info.longitude=-atof(StringTools::stripString(context.boldText[i+1]," E").c_str()); } } else if(context.boldText[i]=="Elevation"){ info.elevation=atof(context.boldText[i+1].c_str()); } } case GET_AEROSOL_VISIBLE: for(int i=0;i<context.tableText.size();i++){ if(context.tableText[i]==" Morning "){ aerosols.depthsAM[2]=StringTools::stof(context.tableText[i+1]); aerosols.depthsAM[3]=StringTools::stof(context.tableText[i+2]); aerosols.depthsAM[4]=StringTools::stof(context.tableText[i+3]); aerosols.depthsAM[5]=StringTools::stof(context.tableText[i+4]); aerosols.depthsAM[6]=StringTools::stof(context.tableText[i+5]); } if(context.tableText[i]==" Afternoon "){ aerosols.depthsPM[2]=StringTools::stof(context.tableText[i+1]); aerosols.depthsPM[3]=StringTools::stof(context.tableText[i+2]); aerosols.depthsPM[4]=StringTools::stof(context.tableText[i+3]); 
aerosols.depthsPM[5]=StringTools::stof(context.tableText[i+4]); aerosols.depthsPM[6]=StringTools::stof(context.tableText[i+5]); } } break; case GET_AEROSOL_UV: for(int i=0;i<context.tableText.size();i++){ if(context.tableText[i]==" Morning "){ aerosols.depthsAM[0]=StringTools::stof(context.tableText[i+1]); aerosols.depthsAM[1]=StringTools::stof(context.tableText[i+2]); } if(context.tableText[i]==" Afternoon "){ aerosols.depthsPM[0]=StringTools::stof(context.tableText[i+1]); aerosols.depthsPM[1]=StringTools::stof(context.tableText[i+2]); } } break; default: break; } }
/*
 * Pass `bytes` bytes from `inbuf` to the HTML push parser held in
 * ctx->parser. `flag` is forwarded unchanged as htmlParseChunk()'s
 * terminate argument. The parser's status code is deliberately discarded.
 */
static void consume_buffer(saxctxt *ctx, const char *inbuf,
                           int bytes, int flag)
{
    (void) htmlParseChunk(ctx->parser, inbuf, bytes, flag);
}