Beispiel #1
0
int
html2text(char *text, const char *content)
{
	int ret;
	xmlNodePtr root;
	xmlErrorPtr err;

	htmlParserCtxtPtr parser;

	parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0);

/*	htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); */
	htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
	
	
	ret = htmlParseChunk(parser, content, xmlStrlen(content), 0);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure: %d: %s\n", \
			ret, err->message);
	}
	
	ret = htmlParseChunk(parser, NULL, 0, 1);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", \
			ret, err->message);
	}
	
	root = xmlDocGetRootElement(parser->myDoc);
	walkTree(parser->myDoc, root, text);
	
	return 0;
}
/*
 * call-seq:
 *  initialize_native(xml_sax, filename, encoding)
 *
 * Initialize the push parser with +xml_sax+ using +filename+ and +encoding+.
 *
 * +filename+ and +encoding+ may each be nil. A non-nil +encoding+ that
 * libxml2 does not recognise raises ArgumentError; failure to create the
 * parser context raises RuntimeError.
 */
static VALUE initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
			       VALUE encoding)
{
  htmlSAXHandlerPtr sax;
  const char * filename = NULL;
  htmlParserCtxtPtr ctx;
  xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;

  Data_Get_Struct(_xml_sax, xmlSAXHandler, sax);

  /*
   * StringValueCStr (rather than StringValuePtr) is used for every string
   * handed to libxml2: it guarantees a NUL-terminated buffer and rejects
   * strings with embedded NUL bytes.
   */
  if (!NIL_P(_filename)) filename = StringValueCStr(_filename);

  if (!NIL_P(encoding)) {
    enc = xmlParseCharEncoding(StringValueCStr(encoding));
    if (enc == XML_CHAR_ENCODING_ERROR)
      rb_raise(rb_eArgError, "Unsupported Encoding");
  }

  /* No initial chunk is supplied; data is pushed later by the caller. */
  ctx = htmlCreatePushParserCtxt(
      sax,
      NULL,
      NULL,
      0,
      filename,
      enc
  );
  if (ctx == NULL)
    rb_raise(rb_eRuntimeError, "Could not create a parser context");

  /* Pair the context with the Ruby object and use that as SAX user data. */
  ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(ctx, self);

  ctx->sax2 = 1;
  DATA_PTR(self) = ctx;
  return self;
}
Beispiel #3
0
/*
 * Run `string` through libxml2's HTML push parser with a SAX handler that
 * only has the `characters` callback set (unhtmlizeHandleCharacters).
 * `buffer` is handed to the parser as the SAX user data.
 */
static void
_unhtmlize (gchar *string, ResultBuffer *buffer)
{
	htmlSAXHandlerPtr handler = g_new0 (htmlSAXHandler, 1);
	htmlParserCtxtPtr parser;

	/* Every other callback stays zeroed from g_new0. */
	handler->characters = unhtmlizeHandleCharacters;

	/* The complete input is supplied as the initial chunk ... */
	parser = htmlCreatePushParserCtxt (handler, buffer, string, strlen (string), "", XML_CHAR_ENCODING_UTF8);
	/* ... so the follow-up call only pushes zero bytes with terminate=1. */
	htmlParseChunk (parser, string, 0, 1);
	htmlFreeParserCtxt (parser);

	g_free (handler);
}
Beispiel #4
0
static void parseHtml(const std::string &html,
                      std::string &title)
{
  htmlParserCtxtPtr ctxt;
  Context context;

  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
                                  XML_CHAR_ENCODING_NONE);

  htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
  htmlParseChunk(ctxt, "", 0, 1);

  htmlFreeParserCtxt(ctxt);

  title = context.title;
}
Beispiel #5
0
/*
 * Allocate a zero-filled GstSamiContext, attach it to state->user_data, and
 * set up its HTML push parser plus four empty GString buffers.
 * Asserts that state->user_data is NULL on entry.
 */
void
sami_context_init (ParserState * state)
{
  GstSamiContext *sami;

  g_assert (state->user_data == NULL);

  sami = g_new0 (GstSamiContext, 1);
  state->user_data = (gpointer) sami;

  /* The context itself is the SAX user data; no input is supplied yet. */
  sami->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, sami,
      "", 0, NULL, XML_CHAR_ENCODING_UTF8);

  sami->buf = g_string_new ("");
  sami->rubybuf = g_string_new ("");
  sami->resultbuf = g_string_new ("");
  sami->state = g_string_new ("");
}
Beispiel #6
0
    void parse(const std::string &page) {
        htmlParserCtxtPtr ctxt;

        htmlSAXHandler handler;
        memset(&handler, 0, sizeof(handler));

        handler.startElement = static_parser_start_element;
        handler.endElement = static_parser_end_element;
        handler.characters = static_parser_characters;

        ctxt = htmlCreatePushParserCtxt(&handler, this, "", 0, "", XML_CHAR_ENCODING_NONE);

        htmlParseChunk(ctxt, page.c_str(), page.size(), 0);
        htmlParseChunk(ctxt, "", 0, 1);

        htmlFreeParserCtxt(ctxt);
    }
Beispiel #7
0
static int create_parser(saxctxt *ctxt, const char **bufp, apr_size_t *bytes)
{
  xmlCharEncoding enc;
#ifndef USE_OLD_LIBXML2
  int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
    XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
#endif
  enc = sniff_encoding(ctxt, *bufp, *bytes);
  ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, *bufp, 0, 0, enc);
  if (ctxt->parser == NULL)
    return 0;
  apr_pool_cleanup_register
    (ctxt->f->r->pool, ctxt->parser, (int(*)(void*))htmlFreeParserCtxt,
     apr_pool_cleanup_null);
#ifndef USE_OLD_LIBXML2
  if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts)
    ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, ctxt->f->r,
                 "create_parser: unsupported parser opts %x", xmlopts);
#endif
  return 1;
}
Beispiel #8
0
  // Parses a search-engine result page with libxml2's HTML push parser.
  //
  // output:       NUL-terminated page content to parse.
  // snippets:     stored in the parser_context given to the SAX callbacks.
  // count_offset: copied into _count before parsing.
  //
  // Throws sp_exception(WB_ERR_PARSE, ...) if an exception escapes the
  // parsing calls or if libxml2 reports an error of level 3. A level 2
  // error is only logged at debug level. The parser context is released on
  // every path out of this function.
  void se_parser::parse_output(char *output,
                               std::vector<search_snippet*> *snippets,
                               const int &count_offset) throw (sp_exception)
  {
    _count = count_offset;

    htmlParserCtxtPtr ctxt = NULL;
    parser_context pc;
    pc._parser = this;
    pc._snippets = snippets;
    pc._current_snippet = NULL;

    // Zero every callback, then install the four that are handled.
    htmlSAXHandler saxHandler = htmlSAXHandler();
    saxHandler.startElement = start_element_wrapper;
    saxHandler.endElement = end_element_wrapper;
    saxHandler.characters = characters_wrapper;
    saxHandler.cdataBlock = cdata_wrapper;

    //mutex_lock(&se_parser::_se_parser_mutex);

    int status = 0;
    try
      {
        ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "",
                                        XML_CHAR_ENCODING_UTF8); // encoding here.
        htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR);

        status = htmlParseChunk(ctxt,output,strlen(output),0);
      }
    catch (const std::exception &e) // by reference: no slicing, what() stays intact.
      {
        if (ctxt)
          xmlFreeParserCtxt(ctxt);
        errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.",
                          e.what());
        throw sp_exception(WB_ERR_PARSE,e.what());
      }
    catch (...) // catch everything else to avoid crashes.
      {
        if (ctxt)
          xmlFreeParserCtxt(ctxt);
        std::string msg = "Unknown error in xml/html parsing of search results";
        errlog::log_error(LOG_LEVEL_PARSER,msg.c_str());
        throw sp_exception(WB_ERR_PARSE,msg);
      }

    bool fatal = false;
    std::string fatal_msg;
    if (status != 0) // an error occurred.
      {
        xmlErrorPtr xep = xmlCtxtGetLastError(ctxt);
        if (xep)
          {
            // The message pointer may be NULL; never build a string from it directly.
            std::string err_msg = xep->message ? xep->message : "";
            miscutil::replace_in_string(err_msg,"\n","");
            errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s",
                              err_msg.c_str());
            // check on error level.
            if (xep->level == 3) // fatal error: reported to the caller below.
              {
                fatal_msg = "libxml2 fatal error";
                errlog::log_error(LOG_LEVEL_PARSER,fatal_msg.c_str());
                fatal = true;
              }
            // XXX: too verbose, and confusing to users.
            else if (xep->level == 2) // recoverable error: log only.
              {
                std::string msg = "libxml2 recoverable error";
                errlog::log_error(LOG_LEVEL_DEBUG,msg.c_str());
              }
          }
      }

    // Single release point. The last-error record belongs to the context,
    // so it must not be touched after this.
    if (ctxt)
      xmlFreeParserCtxt(ctxt);
    //mutex_unlock(&se_parser::_se_parser_mutex);

    if (fatal)
      throw sp_exception(WB_ERR_PARSE,fatal_msg);
  }
Beispiel #9
0
/*
 * Read an HTML document from `input`, parse it with libxml2's push parser
 * and pass every top-level node of the resulting tree to
 * html_search_for_tables().
 *
 * The first four bytes are used to detect the encoding and to skip a byte
 * order mark; the rest is pushed to the parser in chunks of up to 4096
 * bytes. If no document is produced (input shorter than four bytes, read
 * failure, or no parser context), an error is set on `io_context`.
 */
void
html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
		WorkbookView *wb_view, GsfInput *input)
{
	guint8 const *buf;
	gsf_off_t size;
	int len, bomlen;
	htmlParserCtxtPtr ctxt;
	htmlDocPtr doc = NULL;
	xmlCharEncoding enc;
	GnmHtmlTableCtxt tc;

	g_return_if_fail (input != NULL);

	if (gsf_input_seek (input, 0, G_SEEK_SET))
		return;

	size = gsf_input_size (input);
	if (size >= 4) {
		size -= 4;
		buf = gsf_input_read (input, 4, NULL);
		if (buf != NULL) {
			enc = xmlDetectCharEncoding(buf, 4);
			switch (enc) {	/* Skip byte order mark */
			case XML_CHAR_ENCODING_UCS4BE:
			case XML_CHAR_ENCODING_UCS4LE:
			case XML_CHAR_ENCODING_UCS4_2143:
			case XML_CHAR_ENCODING_UCS4_3412:
			case XML_CHAR_ENCODING_EBCDIC:
				bomlen = 4;
				break;
			case XML_CHAR_ENCODING_UTF16BE:
			case XML_CHAR_ENCODING_UTF16LE:
				bomlen = 2;
				break;
			case XML_CHAR_ENCODING_UTF8:
				if (buf[0] == 0xef)
					bomlen = 3;
				else if (buf[0] == 0x3c)
					/* NOTE(review): this drops four bytes that begin with '<',
					   not a BOM - confirm it is intended. */
					bomlen = 4;
				else
					bomlen = 0;
				break;
			case XML_CHAR_ENCODING_NONE:
				bomlen = 0;
				/* Try to detect unmarked UTF16LE
				   (Firefox Windows clipboard, drag data all platforms) */
				if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
				    buf[1] == 0 &&
				    (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
				    buf[3] == 0)
					enc =  XML_CHAR_ENCODING_UTF16LE;
				break;
			default:
				bomlen = 0;
			}
			/* Whatever is left of the first four bytes seeds the parser. */
			ctxt = htmlCreatePushParserCtxt (
				NULL, NULL, (char const *)(buf + bomlen),
				4 - bomlen, gsf_input_name (input), enc);

			/* Context creation can fail; doc then stays NULL and the
			   parse error below is reported instead of crashing. */
			if (ctxt != NULL) {
				for (; size > 0 ; size -= len) {
					len = MIN (4096, size);
					buf = gsf_input_read (input, len, NULL);
					if (buf == NULL)
						break;
					htmlParseChunk (
						ctxt, (char const *)buf, len, 0);
				}

				/* Zero-length chunk with terminate=1 ends the parse. */
				htmlParseChunk (ctxt, (char const *)buf, 0, 1);
				/* Take the document before freeing the context; it is
				   released separately with xmlFreeDoc below. */
				doc = ctxt->myDoc;
				htmlFreeParserCtxt (ctxt);
			}
		}
	}

	if (doc != NULL) {
		xmlNodePtr ptr;
		tc.sheet = NULL;
		tc.row   = -1;
		tc.wb_view = wb_view;
		for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
			html_search_for_tables (ptr, doc, wb_view, &tc);
		xmlFreeDoc (doc);
	} else
		go_io_error_info_set (io_context,
			go_error_info_new_str (_("Unable to parse the html.")));
}
Beispiel #10
0
	void HttpPullData::parseHtml(const std::string &html,int pos,PARSE_ACTION action){
	  htmlParserCtxtPtr ctxt;
	  Context context;
	  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",XML_CHAR_ENCODING_NONE);
	  htmlParseChunk(ctxt,html.c_str()+pos,max((unsigned long)0,html.size()-pos), 0);
	  htmlParseChunk(ctxt, "", 0, 1);
	  htmlFreeParserCtxt(ctxt);
	  switch(action){
	  	case GET_OPTIONS:
	  		options=context.options;
	  		break;
	 	case GET_OZONE:
	 		ozone=-1;
	  		for(unsigned int i=0;i<context.tableText.size();i++){
	  			if(context.tableText[i].find(" DU",0)>=0){
	  				ozone=atof(StringTools::stripString(context.tableText[i]," DU").c_str());
	  				break;
	  			}
	  		}	 	
	  		break;
	  	case GET_LOCATION:
	  		for(int i=0;i<context.boldText.size();i++){
	  			if(context.boldText[i]=="Ecoregion "){
	  				info.ecoregion=context.boldText[i+1];
	  			} else if(context.boldText[i]=="Latitude"){
	  	  			info.latitude=atof(StringTools::stripString(context.boldText[i+1]," N").c_str());
	  			} else if(context.boldText[i]=="Longitude"){
	  				if((int)context.boldText[i+1].find(" W")>=0){
	  					info.longitude=atof(StringTools::stripString(context.boldText[i+1]," W").c_str());
	  				} else if((int)context.boldText[i+1].find(" E")>=0){
	  					info.longitude=-atof(StringTools::stripString(context.boldText[i+1]," E").c_str());
	  				}
	  			} else if(context.boldText[i]=="Elevation"){
	  				info.elevation=atof(context.boldText[i+1].c_str());
	  			}
	  		}
	  	case GET_AEROSOL_VISIBLE:
	  		for(int i=0;i<context.tableText.size();i++){
	  			if(context.tableText[i]==" Morning  "){
	  				aerosols.depthsAM[2]=StringTools::stof(context.tableText[i+1]);
	  				aerosols.depthsAM[3]=StringTools::stof(context.tableText[i+2]);
	  				aerosols.depthsAM[4]=StringTools::stof(context.tableText[i+3]);
	  				aerosols.depthsAM[5]=StringTools::stof(context.tableText[i+4]);
	  				aerosols.depthsAM[6]=StringTools::stof(context.tableText[i+5]);
	  			}
	  			if(context.tableText[i]==" Afternoon  "){
	  				aerosols.depthsPM[2]=StringTools::stof(context.tableText[i+1]);
	  				aerosols.depthsPM[3]=StringTools::stof(context.tableText[i+2]);
	  				aerosols.depthsPM[4]=StringTools::stof(context.tableText[i+3]);
	  				aerosols.depthsPM[5]=StringTools::stof(context.tableText[i+4]);
	  				aerosols.depthsPM[6]=StringTools::stof(context.tableText[i+5]);
	  			}
	  			
			}
			break;
	  	case GET_AEROSOL_UV:
	  		for(int i=0;i<context.tableText.size();i++){
	  			if(context.tableText[i]==" Morning  "){
	  				aerosols.depthsAM[0]=StringTools::stof(context.tableText[i+1]);
	  				aerosols.depthsAM[1]=StringTools::stof(context.tableText[i+2]);
	  			}
	  			if(context.tableText[i]==" Afternoon  "){
	  				aerosols.depthsPM[0]=StringTools::stof(context.tableText[i+1]);
	  				aerosols.depthsPM[1]=StringTools::stof(context.tableText[i+2]);
	  			}
	  			
			}
			break;
		default:
	  		break;
	  }
	}