/*
 * Parse the HTML document in `content` with libxml2's push parser and pass
 * the resulting tree to walkTree() together with the output buffer `text`.
 *
 * Parse problems are reported on stderr but are not treated as fatal: the
 * parser runs in recovery mode and whatever tree was built is still walked.
 *
 * Returns 0 after the tree has been walked, or -1 if the parser context
 * could not be created.
 */
int html2text(char *text, const char *content)
{
    int ret;
    xmlNodePtr root;
    htmlDocPtr doc;
    const xmlError *err;
    htmlParserCtxtPtr parser;

    parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL,
                                      XML_CHAR_ENCODING_NONE);
    if (parser == NULL) {
        fprintf(stderr, "htmlCreatePushParserCtxt failure\n");
        return -1;
    }

    htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS |
                               HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING |
                               HTML_PARSE_NONET);

    /* Feed the whole input as a single chunk. */
    ret = htmlParseChunk(parser, content, xmlStrlen((const xmlChar *)content), 0);
    if (ret != 0) {
        /* The last-error record may be absent; never dereference it blindly. */
        err = xmlCtxtGetLastError(parser);
        fprintf(stderr, "htmlParseChunk failure: %d: %s\n", ret,
                (err != NULL && err->message != NULL) ? err->message : "(no message)\n");
    }

    /* An empty chunk with terminate=1 tells the parser the input is complete. */
    ret = htmlParseChunk(parser, NULL, 0, 1);
    if (ret != 0) {
        err = xmlCtxtGetLastError(parser);
        fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", ret,
                (err != NULL && err->message != NULL) ? err->message : "(no message)\n");
    }

    doc = parser->myDoc;
    root = xmlDocGetRootElement(doc);
    walkTree(doc, root, text);

    /*
     * Freeing the context does not free the document it built, so both must
     * be released. NOTE(review): assumes walkTree() keeps no pointers into
     * the tree after it returns - confirm against its implementation.
     */
    htmlFreeParserCtxt(parser);
    if (doc != NULL)
        xmlFreeDoc(doc);

    return 0;
}
/*
 * call-seq:
 *  initialize_native(xml_sax, filename, encoding)
 *
 * Initialize the push parser with +xml_sax+ using +filename+ and +encoding+.
 * +filename+ and +encoding+ may each be nil.
 */
static VALUE initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
                               VALUE encoding)
{
  htmlSAXHandlerPtr handler;
  htmlParserCtxtPtr push_ctx;
  const char *path = NULL;
  xmlCharEncoding char_enc = XML_CHAR_ENCODING_NONE;

  /* Unwrap the native SAX handler held by the Ruby object. */
  Data_Get_Struct(_xml_sax, xmlSAXHandler, handler);

  if (!NIL_P(_filename)) {
    path = StringValuePtr(_filename);
  }

  if (!NIL_P(encoding)) {
    char_enc = xmlParseCharEncoding(StringValuePtr(encoding));
    if (XML_CHAR_ENCODING_ERROR == char_enc) {
      rb_raise(rb_eArgError, "Unsupported Encoding");
    }
  }

  /* No initial chunk is supplied; data arrives later through the push API. */
  push_ctx = htmlCreatePushParserCtxt(handler, NULL, NULL, 0, path, char_enc);
  if (NULL == push_ctx) {
    rb_raise(rb_eRuntimeError, "Could not create a parser context");
  }

  push_ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(push_ctx, self);
  push_ctx->sax2 = 1;

  /* Store the context on the Ruby object so later calls can retrieve it. */
  DATA_PTR(self) = push_ctx;

  return self;
}
/*
 * Run `string` through the HTML push parser with a SAX handler whose only
 * callback is unhtmlizeHandleCharacters; `buffer` is passed to the parser as
 * the callback user data.
 */
static void
_unhtmlize (gchar *string, ResultBuffer *buffer)
{
	htmlSAXHandlerPtr	handler;
	htmlParserCtxtPtr	parser;

	/* Zero-filled handler: every callback except `characters` stays unset. */
	handler = g_new0 (htmlSAXHandler, 1);
	handler->characters = unhtmlizeHandleCharacters;

	/* The complete string is handed over as the initial chunk ... */
	parser = htmlCreatePushParserCtxt (handler, buffer, string, strlen (string),
	                                   "", XML_CHAR_ENCODING_UTF8);

	/* ... so only a zero-length terminating chunk remains to be pushed. */
	htmlParseChunk (parser, string, 0, 1);

	htmlFreeParserCtxt (parser);
	g_free (handler);
}
static void parseHtml(const std::string &html, std::string &title) { htmlParserCtxtPtr ctxt; Context context; ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt, html.c_str(), html.size(), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); title = context.title; }
/*
 * Allocate a zero-initialised GstSamiContext, attach it to `state->user_data`
 * and set up its HTML push parser and its four working string buffers.
 *
 * `state->user_data` must be NULL on entry (asserted).
 */
void
sami_context_init (ParserState * state)
{
  GstSamiContext *sami;

  g_assert (state->user_data == NULL);

  sami = g_new0 (GstSamiContext, 1);
  state->user_data = (gpointer) sami;

  /* The context itself is the SAX user data; parsing starts with no input. */
  sami->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, sami,
      "", 0, NULL, XML_CHAR_ENCODING_UTF8);

  /* All buffers start out empty. */
  sami->buf = g_string_new ("");
  sami->rubybuf = g_string_new ("");
  sami->resultbuf = g_string_new ("");
  sami->state = g_string_new ("");
}
void parse(const std::string &page) { htmlParserCtxtPtr ctxt; htmlSAXHandler handler; memset(&handler, 0, sizeof(handler)); handler.startElement = static_parser_start_element; handler.endElement = static_parser_end_element; handler.characters = static_parser_characters; ctxt = htmlCreatePushParserCtxt(&handler, this, "", 0, "", XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt, page.c_str(), page.size(), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); }
static int create_parser(saxctxt *ctxt, const char **bufp, apr_size_t *bytes) { xmlCharEncoding enc; #ifndef USE_OLD_LIBXML2 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET | XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING; #endif enc = sniff_encoding(ctxt, *bufp, *bytes); ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, *bufp, 0, 0, enc); if (ctxt->parser == NULL) return 0; apr_pool_cleanup_register (ctxt->f->r->pool, ctxt->parser, (int(*)(void*))htmlFreeParserCtxt, apr_pool_cleanup_null); #ifndef USE_OLD_LIBXML2 if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts) ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, ctxt->f->r, "create_parser: unsupported parser opts %x", xmlopts); #endif return 1; }
void se_parser::parse_output(char *output, std::vector<search_snippet*> *snippets, const int &count_offset) throw (sp_exception) { _count = count_offset; htmlParserCtxtPtr ctxt = NULL; parser_context pc; pc._parser = this; pc._snippets = snippets; pc._current_snippet = NULL; htmlSAXHandler saxHandler = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, start_element_wrapper, end_element_wrapper, NULL, characters_wrapper, NULL, NULL, NULL, NULL, NULL, NULL, NULL, cdata_wrapper, NULL, NULL, NULL, NULL, NULL, NULL }; //mutex_lock(&se_parser::_se_parser_mutex); int status = 0; try { ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "", XML_CHAR_ENCODING_UTF8); // encoding here. htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR); status = htmlParseChunk(ctxt,output,strlen(output),0); } catch (std::exception e) { errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.", e.what()); throw sp_exception(WB_ERR_PARSE,e.what()); } catch (...) // catch everything else to avoid crashes. { std::string msg = "Unknown error in xml/html parsing of search results"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); throw sp_exception(WB_ERR_PARSE,msg); } if (status == 0) { if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); } else // an error occurred. { xmlErrorPtr xep = xmlCtxtGetLastError(ctxt); if (xep) { std::string err_msg = std::string(xep->message); miscutil::replace_in_string(err_msg,"\n",""); errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s", err_msg.c_str()); // check on error level. if (xep->level == 3) // fatal or recoverable error. { std::string msg = "libxml2 fatal error"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); throw sp_exception(WB_ERR_PARSE,msg); } // XXX: too verbose, and confusing to users. 
else if (xep->level == 2) { std::string msg = "libxml2 recoverable error"; errlog::log_error(LOG_LEVEL_DEBUG,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); //throw sp_exception(WB_ERR_PARSE,msg); } } } }
/*
 * Read an HTML document from `input`, parse it with libxml2's push parser
 * and pass each top-level node of the resulting document to
 * html_search_for_tables().
 *
 * The first four bytes are used to detect the character encoding and to
 * decide how many leading bytes to leave out of the parser's initial chunk;
 * the rest of the input is pushed in chunks of up to 4096 bytes.
 *
 * If no document could be produced (input shorter than four bytes, read
 * failure on the first bytes, parser creation failure, or the parser built
 * no document) an error is recorded on `io_context`.
 */
void
html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
		WorkbookView *wb_view, GsfInput *input)
{
	guint8 const *buf;
	gsf_off_t size;
	int len, bomlen;
	htmlParserCtxtPtr ctxt;
	htmlDocPtr doc = NULL;
	xmlCharEncoding enc;
	GnmHtmlTableCtxt tc;

	g_return_if_fail (input != NULL);

	if (gsf_input_seek (input, 0, G_SEEK_SET))
		return;

	size = gsf_input_size (input);
	if (size >= 4) {
		size -= 4;
		buf = gsf_input_read (input, 4, NULL);
		if (buf != NULL) {
			enc = xmlDetectCharEncoding(buf, 4);
			switch (enc) {	/* Skip byte order mark */
			case XML_CHAR_ENCODING_UCS4BE:
			case XML_CHAR_ENCODING_UCS4LE:
			case XML_CHAR_ENCODING_UCS4_2143:
			case XML_CHAR_ENCODING_UCS4_3412:
			case XML_CHAR_ENCODING_EBCDIC:
				bomlen = 4;
				break;
			case XML_CHAR_ENCODING_UTF16BE:
			case XML_CHAR_ENCODING_UTF16LE:
				bomlen = 2;
				break;
			case XML_CHAR_ENCODING_UTF8:
				if (buf[0] == 0xef)
					bomlen = 3;
				else if (buf[0] == 0x3c)
					/* NOTE(review): 0x3c is '<'; this keeps all four
					 * leading bytes out of the initial chunk - confirm
					 * that is intended. */
					bomlen = 4;
				else
					bomlen = 0;
				break;
			case XML_CHAR_ENCODING_NONE:
				bomlen = 0;
				/* Try to detect unmarked UTF16LE
				   (Firefox Windows clipboard, drag data all platforms) */
				if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
				    buf[1] == 0 &&
				    (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
				    buf[3] == 0)
					enc = XML_CHAR_ENCODING_UTF16LE;
				break;
			default:
				bomlen = 0;
			}

			ctxt = htmlCreatePushParserCtxt (
				NULL, NULL, (char const *)(buf + bomlen),
				4 - bomlen, gsf_input_name (input), enc);

			/* Creation can fail; without a context there is nothing to
			 * push to and no document to take, so fall through to the
			 * "unable to parse" error below. */
			if (ctxt != NULL) {
				for (; size > 0 ; size -= len) {
					len = MIN (4096, size);
					buf = gsf_input_read (input, len, NULL);
					if (buf == NULL)
						break;
					htmlParseChunk (
						ctxt, (char const *)buf, len, 0);
				}

				/* Zero-length terminating chunk: finishes the parse. */
				htmlParseChunk (ctxt, (char const *)buf, 0, 1);

				/* Take the document before the context is freed; it is
				 * released separately with xmlFreeDoc() below. */
				doc = ctxt->myDoc;
				htmlFreeParserCtxt (ctxt);
			}
		}
	}

	if (doc != NULL) {
		xmlNodePtr ptr;

		tc.sheet = NULL;
		tc.row   = -1;
		tc.wb_view = wb_view;
		for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
			html_search_for_tables (ptr, doc, wb_view, &tc);
		xmlFreeDoc (doc);
	} else
		go_io_error_info_set (io_context,
			go_error_info_new_str (_("Unable to parse the html.")));
}
void HttpPullData::parseHtml(const std::string &html,int pos,PARSE_ACTION action){ htmlParserCtxtPtr ctxt; Context context; ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt,html.c_str()+pos,max((unsigned long)0,html.size()-pos), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); switch(action){ case GET_OPTIONS: options=context.options; break; case GET_OZONE: ozone=-1; for(unsigned int i=0;i<context.tableText.size();i++){ if(context.tableText[i].find(" DU",0)>=0){ ozone=atof(StringTools::stripString(context.tableText[i]," DU").c_str()); break; } } break; case GET_LOCATION: for(int i=0;i<context.boldText.size();i++){ if(context.boldText[i]=="Ecoregion "){ info.ecoregion=context.boldText[i+1]; } else if(context.boldText[i]=="Latitude"){ info.latitude=atof(StringTools::stripString(context.boldText[i+1]," N").c_str()); } else if(context.boldText[i]=="Longitude"){ if((int)context.boldText[i+1].find(" W")>=0){ info.longitude=atof(StringTools::stripString(context.boldText[i+1]," W").c_str()); } else if((int)context.boldText[i+1].find(" E")>=0){ info.longitude=-atof(StringTools::stripString(context.boldText[i+1]," E").c_str()); } } else if(context.boldText[i]=="Elevation"){ info.elevation=atof(context.boldText[i+1].c_str()); } } case GET_AEROSOL_VISIBLE: for(int i=0;i<context.tableText.size();i++){ if(context.tableText[i]==" Morning "){ aerosols.depthsAM[2]=StringTools::stof(context.tableText[i+1]); aerosols.depthsAM[3]=StringTools::stof(context.tableText[i+2]); aerosols.depthsAM[4]=StringTools::stof(context.tableText[i+3]); aerosols.depthsAM[5]=StringTools::stof(context.tableText[i+4]); aerosols.depthsAM[6]=StringTools::stof(context.tableText[i+5]); } if(context.tableText[i]==" Afternoon "){ aerosols.depthsPM[2]=StringTools::stof(context.tableText[i+1]); aerosols.depthsPM[3]=StringTools::stof(context.tableText[i+2]); aerosols.depthsPM[4]=StringTools::stof(context.tableText[i+3]); 
aerosols.depthsPM[5]=StringTools::stof(context.tableText[i+4]); aerosols.depthsPM[6]=StringTools::stof(context.tableText[i+5]); } } break; case GET_AEROSOL_UV: for(int i=0;i<context.tableText.size();i++){ if(context.tableText[i]==" Morning "){ aerosols.depthsAM[0]=StringTools::stof(context.tableText[i+1]); aerosols.depthsAM[1]=StringTools::stof(context.tableText[i+2]); } if(context.tableText[i]==" Afternoon "){ aerosols.depthsPM[0]=StringTools::stof(context.tableText[i+1]); aerosols.depthsPM[1]=StringTools::stof(context.tableText[i+2]); } } break; default: break; } }