Beispiel #1
0
int
html2text(char *text, const char *content)
{
	int ret;
	xmlNodePtr root;
	xmlErrorPtr err;

	htmlParserCtxtPtr parser;

	parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0);

/*	htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); */
	htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
	
	
	ret = htmlParseChunk(parser, content, xmlStrlen(content), 0);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure: %d: %s\n", \
			ret, err->message);
	}
	
	ret = htmlParseChunk(parser, NULL, 0, 1);
	if (ret != 0) {
		err = xmlCtxtGetLastError(parser);
		fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", \
			ret, err->message);
	}
	
	root = xmlDocGetRootElement(parser->myDoc);
	walkTree(parser->myDoc, root, text);
	
	return 0;
}
Beispiel #2
0
  void se_parser::parse_output(char *output,
                               std::vector<search_snippet*> *snippets,
                               const int &count_offset) throw (sp_exception)
  {
    _count = count_offset;

    htmlParserCtxtPtr ctxt = NULL;
    parser_context pc;
    pc._parser = this;
    pc._snippets = snippets;
    pc._current_snippet = NULL;

    htmlSAXHandler saxHandler =
    {
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      start_element_wrapper,
      end_element_wrapper,
      NULL,
      characters_wrapper,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      cdata_wrapper,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL,
      NULL
    };

    //mutex_lock(&se_parser::_se_parser_mutex);

    int status = 0;
    try
      {
        ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "",
                                        XML_CHAR_ENCODING_UTF8); // encoding here.
        htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR);

        status = htmlParseChunk(ctxt,output,strlen(output),0);
      }
    catch (std::exception e)
      {
        errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.",
                          e.what());
        throw sp_exception(WB_ERR_PARSE,e.what());
      }
    catch (...) // catch everything else to avoid crashes.
      {
        std::string msg = "Unknown error in xml/html parsing of search results";
        errlog::log_error(LOG_LEVEL_PARSER,msg.c_str());
        throw sp_exception(WB_ERR_PARSE,msg);
      }

    if (status == 0)
      {
        if (ctxt)
          xmlFreeParserCtxt(ctxt);
        //mutex_unlock(&se_parser::_se_parser_mutex);
      }
    else // an error occurred.
      {
        xmlErrorPtr xep = xmlCtxtGetLastError(ctxt);
        if (xep)
          {
            std::string err_msg = std::string(xep->message);
            miscutil::replace_in_string(err_msg,"\n","");
            errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s",
                              err_msg.c_str());
            // check on error level.
            if (xep->level == 3) // fatal or recoverable error.
              {
                std::string msg = "libxml2 fatal error";
                errlog::log_error(LOG_LEVEL_PARSER,msg.c_str());
                if (ctxt)
                  xmlFreeParserCtxt(ctxt);
                //mutex_unlock(&se_parser::_se_parser_mutex);
                throw sp_exception(WB_ERR_PARSE,msg);
              }
            // XXX: too verbose, and confusing to users.
            else if (xep->level == 2)
              {
                std::string msg = "libxml2 recoverable error";
                errlog::log_error(LOG_LEVEL_DEBUG,msg.c_str());
                if (ctxt)
                  xmlFreeParserCtxt(ctxt);
                //mutex_unlock(&se_parser::_se_parser_mutex);
                //throw sp_exception(WB_ERR_PARSE,msg);
              }
          }
      }

  }