int html2text(char *text, const char *content) { int ret; xmlNodePtr root; xmlErrorPtr err; htmlParserCtxtPtr parser; parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0); /* htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); */ htmlCtxtUseOptions(parser, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); ret = htmlParseChunk(parser, content, xmlStrlen(content), 0); if (ret != 0) { err = xmlCtxtGetLastError(parser); fprintf(stderr, "htmlParseChunk failure: %d: %s\n", \ ret, err->message); } ret = htmlParseChunk(parser, NULL, 0, 1); if (ret != 0) { err = xmlCtxtGetLastError(parser); fprintf(stderr, "htmlParseChunk failure 2: %d: %s\n", \ ret, err->message); } root = xmlDocGetRootElement(parser->myDoc); walkTree(parser->myDoc, root, text); return 0; }
void se_parser::parse_output(char *output, std::vector<search_snippet*> *snippets, const int &count_offset) throw (sp_exception) { _count = count_offset; htmlParserCtxtPtr ctxt = NULL; parser_context pc; pc._parser = this; pc._snippets = snippets; pc._current_snippet = NULL; htmlSAXHandler saxHandler = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, start_element_wrapper, end_element_wrapper, NULL, characters_wrapper, NULL, NULL, NULL, NULL, NULL, NULL, NULL, cdata_wrapper, NULL, NULL, NULL, NULL, NULL, NULL }; //mutex_lock(&se_parser::_se_parser_mutex); int status = 0; try { ctxt = htmlCreatePushParserCtxt(&saxHandler, &pc, "", 0, "", XML_CHAR_ENCODING_UTF8); // encoding here. htmlCtxtUseOptions(ctxt,HTML_PARSE_NOERROR); status = htmlParseChunk(ctxt,output,strlen(output),0); } catch (std::exception e) { errlog::log_error(LOG_LEVEL_PARSER,"Error %s in xml/html parsing of search results.", e.what()); throw sp_exception(WB_ERR_PARSE,e.what()); } catch (...) // catch everything else to avoid crashes. { std::string msg = "Unknown error in xml/html parsing of search results"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); throw sp_exception(WB_ERR_PARSE,msg); } if (status == 0) { if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); } else // an error occurred. { xmlErrorPtr xep = xmlCtxtGetLastError(ctxt); if (xep) { std::string err_msg = std::string(xep->message); miscutil::replace_in_string(err_msg,"\n",""); errlog::log_error(LOG_LEVEL_PARSER, "html level parsing error (libxml2): %s", err_msg.c_str()); // check on error level. if (xep->level == 3) // fatal or recoverable error. { std::string msg = "libxml2 fatal error"; errlog::log_error(LOG_LEVEL_PARSER,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); throw sp_exception(WB_ERR_PARSE,msg); } // XXX: too verbose, and confusing to users. else if (xep->level == 2) { std::string msg = "libxml2 recoverable error"; errlog::log_error(LOG_LEVEL_DEBUG,msg.c_str()); if (ctxt) xmlFreeParserCtxt(ctxt); //mutex_unlock(&se_parser::_se_parser_mutex); //throw sp_exception(WB_ERR_PARSE,msg); } } } }