예제 #1
0
파일: html2text.c 프로젝트: xjtuwjp/farm
int
html2text(char *text, const char *content)
{
	int ret;
	xmlNodePtr root;
	xmlErrorPtr err;

	htmlParserCtxtPtr parser;

	parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0);

/*	htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); */
	htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
	
	
	ret = htmlParseChunk(parser, content, xmlStrlen(content), 0);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure: %d: %s\n", \
			ret, err->message);
	}
	
	ret = htmlParseChunk(parser, NULL, 0, 1);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", \
			ret, err->message);
	}
	
	root = xmlDocGetRootElement(parser->myDoc);
	walkTree(parser->myDoc, root, text);
	
	return 0;
}
예제 #2
0
파일: htmltitle.cpp 프로젝트: 08142008/curl
static void parseHtml(const std::string &html,
                      std::string &title)
{
  htmlParserCtxtPtr ctxt;
  Context context;

  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
                                  XML_CHAR_ENCODING_NONE);

  htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
  htmlParseChunk(ctxt, "", 0, 1);

  htmlFreeParserCtxt(ctxt);

  title = context.title;
}
예제 #3
0
EXPORT_C
#endif


/*
 * Push one line of input through the HTML push parser kept in the
 * GstSamiContext stored in state->user_data.
 *
 * The line is first run through fix_invalid_entities(); the returned copy
 * is what gets parsed, and it is released right afterwards.
 *
 * When the context reports a pending result (has_result), the accumulated
 * result text is handed over to the caller:
 *   - non-empty ruby text gets a trailing newline, is prepended to the
 *     result and the ruby buffer is emptied;
 *   - state->start_time and state->duration are filled in from the
 *     context's time1 / time2;
 *   - the context receives a fresh, empty result buffer and has_result is
 *     cleared.
 *
 * Returns the result characters (the GString wrapper is freed but its
 * character data is kept, so the caller releases it with g_free()), or
 * NULL when no result is pending.
 */
gchar *
parse_sami (ParserState * state, const gchar * line)
{
  GstSamiContext *sami = (GstSamiContext *) state->user_data;
  gchar *escaped = fix_invalid_entities (line);
  gchar *text;

  htmlParseChunk (sami->htmlctxt, escaped, strlen (escaped), 0);
  g_free (escaped);

  if (!sami->has_result)
    return NULL;

  if (sami->rubybuf->len) {
    sami->rubybuf = g_string_append_c (sami->rubybuf, '\n');
    g_string_prepend (sami->resultbuf, sami->rubybuf->str);
    sami->rubybuf = g_string_truncate (sami->rubybuf, 0);
  }

  /* Detach the character data and start over with an empty buffer. */
  text = g_string_free (sami->resultbuf, FALSE);
  sami->resultbuf = g_string_new ("");

  state->start_time = sami->time1;
  state->duration = sami->time2 - sami->time1;
  sami->has_result = FALSE;

  return text;
}
예제 #4
0
EXPORT_C
#endif


/*
 * Tear down the GstSamiContext stored in state->user_data.
 *
 * Sends a terminating empty chunk to the HTML push parser, frees the parser
 * context and any document it produced, releases the four GString buffers
 * owned by the context, frees the context itself and resets
 * state->user_data to NULL. Does nothing when no context is attached.
 */
void
sami_context_deinit (ParserState * state)
{
  GstSamiContext *sami = (GstSamiContext *) state->user_data;
  htmlParserCtxtPtr parser;
  htmlDocPtr tree;

  if (!sami)
    return;

  /* Finish the parse first, then grab the document before the context
   * that points at it goes away. */
  parser = sami->htmlctxt;
  htmlParseChunk (parser, "", 0, 1);
  tree = parser->myDoc;
  htmlFreeParserCtxt (parser);
  sami->htmlctxt = NULL;
  if (tree)
    xmlFreeDoc (tree);

  g_string_free (sami->buf, TRUE);
  g_string_free (sami->rubybuf, TRUE);
  g_string_free (sami->resultbuf, TRUE);
  g_string_free (sami->state, TRUE);

  g_free (sami);
  state->user_data = NULL;
}
예제 #5
0
파일: parser.hpp 프로젝트: kod3r/wookie
    void parse(const std::string &page) {
        htmlParserCtxtPtr ctxt;

        htmlSAXHandler handler;
        memset(&handler, 0, sizeof(handler));

        handler.startElement = static_parser_start_element;
        handler.endElement = static_parser_end_element;
        handler.characters = static_parser_characters;

        ctxt = htmlCreatePushParserCtxt(&handler, this, "", 0, "", XML_CHAR_ENCODING_NONE);

        htmlParseChunk(ctxt, page.c_str(), page.size(), 0);
        htmlParseChunk(ctxt, "", 0, 1);

        htmlFreeParserCtxt(ctxt);
    }
예제 #6
0
파일: utils.c 프로젝트: UIKit0/libgrss
/*
 * Run `string` through libxml2's HTML push parser with a SAX handler whose
 * only callback is unhtmlizeHandleCharacters; `buffer` is passed as the SAX
 * user data.
 *
 * The complete input is supplied as the initial chunk when the parser
 * context is created, so the single htmlParseChunk() call that follows
 * pushes zero additional bytes and only terminates the parse.
 */
static void
_unhtmlize (gchar *string, ResultBuffer *buffer)
{
	htmlSAXHandlerPtr handler = g_new0 (htmlSAXHandler, 1);
	htmlParserCtxtPtr parser;

	handler->characters = unhtmlizeHandleCharacters;

	parser = htmlCreatePushParserCtxt (handler, buffer,
	                                   string, strlen (string),
	                                   "", XML_CHAR_ENCODING_UTF8);
	htmlParseChunk (parser, string, 0, 1);

	htmlFreeParserCtxt (parser);
	g_free (handler);
}
/*
 * call-seq:
 *  native_write(chunk, last_chunk)
 *
 * Write +chunk+ to the PushParser. A nil +chunk+ pushes no data. Passing
 * true as +last_chunk+ tells the parser that this is the final chunk.
 * If parsing reports an error and the context was not created with the
 * RECOVER option, the context's last error is raised.
 */
static VALUE native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
{
  xmlParserCtxtPtr parser;
  const char *data = NULL;
  int length       = 0;
  int terminate    = (Qtrue == _last_chunk) ? 1 : 0;

  Data_Get_Struct(self, xmlParserCtxt, parser);

  if(Qnil != _chunk) {
    data   = StringValuePtr(_chunk);
    length = (int)RSTRING_LEN(_chunk);
  }

  /* Clean parse: nothing to report. */
  if(!htmlParseChunk(parser, data, length, terminate))
    return self;

  /* In recover mode, parse errors are tolerated silently. */
  if(parser->options & XML_PARSE_RECOVER)
    return self;

  {
    xmlErrorPtr last_error = xmlCtxtGetLastError(parser);
    Nokogiri_error_raise(NULL, last_error);
  }

  return self;
}
예제 #8
0
  /**
   * \brief Feeds a search engine's HTML output through a libxml2 push
   *        parser whose SAX callbacks are the start/end element, characters
   *        and CDATA wrappers, with a parser_context (this parser, the
   *        snippet vector, no current snippet) as user data.
   *
   * @param output       NUL-terminated HTML text to parse.
   * @param snippets     vector made available to the SAX callbacks through
   *                     the parser_context.
   * @param count_offset value stored in _count before parsing starts.
   *
   * @throws sp_exception (WB_ERR_PARSE) when a C++ exception escapes from
   *         the parsing calls, or when libxml2's last recorded error is of
   *         fatal level. Errors of lower level are logged only.
   *
   * The parser context is released on every exit path, including the
   * exception paths.
   *
   * NOTE(review): no terminating htmlParseChunk(..., 1) call is issued, so
   * libxml2 is never told that the input is complete -- confirm that the
   * callbacks do not depend on end-of-document processing.
   */
  void se_parser::parse_output(char *output,
                               std::vector<search_snippet*> *snippets,
                               const int &count_offset) throw (sp_exception)
  {
    _count = count_offset;

    htmlParserCtxtPtr ctxt = NULL;
    parser_context pc;
    pc._parser = this;
    pc._snippets = snippets;
    pc._current_snippet = NULL;

    // Positional initialisation: the non-NULL slots are, in order,
    // startElement, endElement, characters and cdataBlock.
    htmlSAXHandler saxHandler =
    {
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      start_element_wrapper,
      end_element_wrapper,
      NULL,
      characters_wrapper,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      cdata_wrapper,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL
    };

    int status = 0;
    try
      {
        ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "",
                                        XML_CHAR_ENCODING_UTF8); // encoding here.
        htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR);

        status = htmlParseChunk(ctxt,output,strlen(output),0);
      }
    catch (const std::exception &e) // by reference: avoids slicing the exception.
      {
        errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.",
                          e.what());
        if (ctxt)
          xmlFreeParserCtxt(ctxt);
        throw sp_exception(WB_ERR_PARSE,e.what());
      }
    catch (...) // catch everything else to avoid crashes.
      {
        std::string msg = "Unknown error in xml/html parsing of search results";
        errlog::log_error(LOG_LEVEL_PARSER,"%s",msg.c_str());
        if (ctxt)
          xmlFreeParserCtxt(ctxt);
        throw sp_exception(WB_ERR_PARSE,msg);
      }

    // Inspect the error (if any) while the context is still alive, then
    // release the context exactly once before returning or throwing.
    bool fatal = false;
    std::string fatal_msg;

    if (status != 0) // an error occurred.
      {
        xmlErrorPtr xep = xmlCtxtGetLastError(ctxt);
        if (xep)
          {
            // The message pointer may be NULL; std::string(NULL) is undefined.
            std::string err_msg = xep->message ? std::string(xep->message) : std::string();
            miscutil::replace_in_string(err_msg,"\n","");
            errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s",
                              err_msg.c_str());

            // check on error level.
            if (xep->level == XML_ERR_FATAL)
              {
                fatal_msg = "libxml2 fatal error";
                errlog::log_error(LOG_LEVEL_PARSER,"%s",fatal_msg.c_str());
                fatal = true;
              }
            // XXX: too verbose, and confusing to users, hence debug level only.
            else if (xep->level == XML_ERR_ERROR)
              {
                std::string msg = "libxml2 recoverable error";
                errlog::log_error(LOG_LEVEL_DEBUG,"%s",msg.c_str());
              }
          }
      }

    if (ctxt)
      xmlFreeParserCtxt(ctxt);

    if (fatal)
      throw sp_exception(WB_ERR_PARSE,fatal_msg);
  }
예제 #9
0
파일: html_read.c 프로젝트: GNOME/gnumeric
/*
 * Read an HTML document from `input`, parse it with libxml2's push parser
 * and run html_search_for_tables() over every top-level node of the
 * resulting document.
 *
 * The first four bytes are sniffed with xmlDetectCharEncoding() to choose
 * the encoding and to decide how many leading bytes to skip; the rest of
 * the input is pushed to the parser in pieces of at most 4096 bytes.
 *
 * fo:         unused.
 * io_context: receives the "Unable to parse the html." error when no
 *             document could be produced.
 * wb_view:    forwarded to html_search_for_tables() and stored in the
 *             table context.
 * input:      source stream; must not be NULL.
 */
void
html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
		WorkbookView *wb_view, GsfInput *input)
{
	guint8 const *buf;
	gsf_off_t size;
	int len, bomlen;
	htmlParserCtxtPtr ctxt;
	htmlDocPtr doc = NULL;
	xmlCharEncoding enc;
	GnmHtmlTableCtxt tc;

	g_return_if_fail (input != NULL);

	if (gsf_input_seek (input, 0, G_SEEK_SET))
		return;

	size = gsf_input_size (input);
	if (size >= 4) {
		/* The first four bytes are consumed by the encoding sniff. */
		size -= 4;
		buf = gsf_input_read (input, 4, NULL);
		if (buf != NULL) {
			enc = xmlDetectCharEncoding(buf, 4);
			switch (enc) {	/* Skip byte order mark */
			case XML_CHAR_ENCODING_UCS4BE:
			case XML_CHAR_ENCODING_UCS4LE:
			case XML_CHAR_ENCODING_UCS4_2143:
			case XML_CHAR_ENCODING_UCS4_3412:
			case XML_CHAR_ENCODING_EBCDIC:
				bomlen = 4;
				break;
			case XML_CHAR_ENCODING_UTF16BE:
			case XML_CHAR_ENCODING_UTF16LE:
				bomlen = 2;
				break;
			case XML_CHAR_ENCODING_UTF8:
				if (buf[0] == 0xef)
					bomlen = 3;
				/* NOTE(review): a leading '<' (0x3c) drops all four
				   sniffed bytes although they are not a byte order
				   mark -- confirm this is intended. */
				else if (buf[0] == 0x3c)
					bomlen = 4;
				else
					bomlen = 0;
				break;
			case XML_CHAR_ENCODING_NONE:
				bomlen = 0;
				/* Try to detect unmarked UTF16LE
				   (Firefox Windows clipboard, drag data all platforms) */
				if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
				    buf[1] == 0 &&
				    (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
				    buf[3] == 0)
					enc =  XML_CHAR_ENCODING_UTF16LE;
				break;
			default:
				bomlen = 0;
			}
			/* Whatever remains of the sniffed bytes seeds the parser. */
			ctxt = htmlCreatePushParserCtxt (
				NULL, NULL, (char const *)(buf + bomlen),
				4 - bomlen, gsf_input_name (input), enc);

			/* Context creation can fail; in that case doc stays NULL
			   and the generic parse error below is reported instead
			   of dereferencing a NULL context. */
			if (ctxt != NULL) {
				for (; size > 0 ; size -= len) {
					len = MIN (4096, size);
					buf = gsf_input_read (input, len, NULL);
					if (buf == NULL)
						break;
					htmlParseChunk (
						ctxt, (char const *)buf, len, 0);
				}

				/* Zero-length terminating chunk ends the parse. */
				htmlParseChunk (ctxt, (char const *)buf, 0, 1);
				/* The document is taken before the context is freed
				   and is released separately further down. */
				doc = ctxt->myDoc;
				htmlFreeParserCtxt (ctxt);
			}
		}
	}

	if (doc != NULL) {
		xmlNodePtr ptr;
		tc.sheet = NULL;
		tc.row   = -1;
		tc.wb_view = wb_view;
		for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
			html_search_for_tables (ptr, doc, wb_view, &tc);
		xmlFreeDoc (doc);
	} else
		go_io_error_info_set (io_context,
			go_error_info_new_str (_("Unable to parse the html.")));
}
예제 #10
0
	/**
	 * Parses the part of `html` that starts at offset `pos` with a libxml2
	 * push parser driven by the file-level `saxHandler`, then copies the
	 * pieces selected by `action` out of the collected Context.
	 *
	 * @param html   page source.
	 * @param pos    offset at which parsing starts; values outside
	 *               [0, html.size()] are clamped to that range.
	 * @param action which data to extract:
	 *   - GET_OPTIONS:         copies context.options into `options`.
	 *   - GET_OZONE:           sets `ozone` from the first table text that
	 *                          contains " DU", or -1 when there is none.
	 *   - GET_LOCATION:        fills ecoregion / latitude / longitude /
	 *                          elevation of `info` from the bold text that
	 *                          follows the matching label.
	 *   - GET_AEROSOL_VISIBLE: fills depthsAM/depthsPM [2..6] from the five
	 *                          entries after " Morning  " / " Afternoon  ".
	 *   - GET_AEROSOL_UV:      fills depthsAM/depthsPM [0..1] from the two
	 *                          entries after the same labels.
	 *
	 * A label that is not followed by enough entries is ignored rather than
	 * read past the end of the collected text.
	 */
	void HttpPullData::parseHtml(const std::string &html,int pos,PARSE_ACTION action){
		// Clamp the start offset: an unsigned "size - pos" can never be
		// negative, so it cannot guard against pos lying outside the string.
		std::string::size_type start=0;
		if(pos>0){
			start=(std::string::size_type)pos;
			if(start>html.size())
				start=html.size();
		}

		htmlParserCtxtPtr ctxt;
		Context context;
		ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",XML_CHAR_ENCODING_NONE);
		htmlParseChunk(ctxt,html.c_str()+start,(int)(html.size()-start), 0);
		htmlParseChunk(ctxt, "", 0, 1);
		htmlFreeParserCtxt(ctxt);

		const size_t tableCount=context.tableText.size();
		const size_t boldCount=context.boldText.size();

		switch(action){
			case GET_OPTIONS:
				options=context.options;
				break;
			case GET_OZONE:
				ozone=-1;
				for(size_t i=0;i<tableCount;i++){
					// find() returns an unsigned position; compare with npos
					// (">= 0" is always true and matched every entry).
					if(context.tableText[i].find(" DU",0)!=std::string::npos){
						ozone=atof(StringTools::stripString(context.tableText[i]," DU").c_str());
						break;
					}
				}
				break;
			case GET_LOCATION:
				// Each label is followed by its value, so stop one entry early.
				for(size_t i=0;i+1<boldCount;i++){
					if(context.boldText[i]=="Ecoregion "){
						info.ecoregion=context.boldText[i+1];
					} else if(context.boldText[i]=="Latitude"){
						info.latitude=atof(StringTools::stripString(context.boldText[i+1]," N").c_str());
					} else if(context.boldText[i]=="Longitude"){
						if(context.boldText[i+1].find(" W")!=std::string::npos){
							info.longitude=atof(StringTools::stripString(context.boldText[i+1]," W").c_str());
						} else if(context.boldText[i+1].find(" E")!=std::string::npos){
							info.longitude=-atof(StringTools::stripString(context.boldText[i+1]," E").c_str());
						}
					} else if(context.boldText[i]=="Elevation"){
						info.elevation=atof(context.boldText[i+1].c_str());
					}
				}
				// Previously fell through into GET_AEROSOL_VISIBLE.
				break;
			case GET_AEROSOL_VISIBLE:
				// Five values follow each label.
				for(size_t i=0;i+5<tableCount;i++){
					if(context.tableText[i]==" Morning  "){
						aerosols.depthsAM[2]=StringTools::stof(context.tableText[i+1]);
						aerosols.depthsAM[3]=StringTools::stof(context.tableText[i+2]);
						aerosols.depthsAM[4]=StringTools::stof(context.tableText[i+3]);
						aerosols.depthsAM[5]=StringTools::stof(context.tableText[i+4]);
						aerosols.depthsAM[6]=StringTools::stof(context.tableText[i+5]);
					}
					if(context.tableText[i]==" Afternoon  "){
						aerosols.depthsPM[2]=StringTools::stof(context.tableText[i+1]);
						aerosols.depthsPM[3]=StringTools::stof(context.tableText[i+2]);
						aerosols.depthsPM[4]=StringTools::stof(context.tableText[i+3]);
						aerosols.depthsPM[5]=StringTools::stof(context.tableText[i+4]);
						aerosols.depthsPM[6]=StringTools::stof(context.tableText[i+5]);
					}
				}
				break;
			case GET_AEROSOL_UV:
				// Two values follow each label.
				for(size_t i=0;i+2<tableCount;i++){
					if(context.tableText[i]==" Morning  "){
						aerosols.depthsAM[0]=StringTools::stof(context.tableText[i+1]);
						aerosols.depthsAM[1]=StringTools::stof(context.tableText[i+2]);
					}
					if(context.tableText[i]==" Afternoon  "){
						aerosols.depthsPM[0]=StringTools::stof(context.tableText[i+1]);
						aerosols.depthsPM[1]=StringTools::stof(context.tableText[i+2]);
					}
				}
				break;
			default:
				break;
		}
	}
예제 #11
0
/*
 * Forward `bytes` bytes starting at `inbuf` to the HTML push parser held in
 * ctx->parser. `flag` is passed through unchanged as the last argument of
 * htmlParseChunk(); the parser's return value is deliberately not inspected.
 */
static void consume_buffer(saxctxt * ctx, const char *inbuf, int bytes, int flag)
{
  (void) htmlParseChunk(ctx->parser, inbuf, bytes, flag);
}