QString tidyHtml(QString str, bool& ok) { #ifdef NO_TIDY ok = true; return str; #else QString res = str; ok = false; static bool isTidyWithIntBodyOnly = isTidyWithIntBodyOnlyCheck(); TidyDoc tdoc = tidyCreate(); TidyBuffer output; TidyBuffer errbuf; tidyBufInit(&output); tidyBufInit(&errbuf); bool configOk = tidyOptSetBool(tdoc, TidyXhtmlOut, yes) && tidyOptSetBool(tdoc, TidyForceOutput, yes) && tidyOptSetBool(tdoc, TidyMark, no) && (isTidyWithIntBodyOnly ? tidyOptSetInt(tdoc, TidyBodyOnly, 1) : tidyOptSetBool(tdoc, TidyBodyOnly, yes)) && tidyOptSetInt(tdoc, TidyWrapLen, 0) && tidyOptSetInt(tdoc, TidyDoctypeMode, TidyDoctypeOmit); if (configOk && (tidySetCharEncoding(tdoc, "utf8") >= 0) && (tidySetErrorBuffer(tdoc, &errbuf) >= 0) && (tidyParseString(tdoc, str.toUtf8().data()) >= 0) && (tidyCleanAndRepair(tdoc) >= 0) && (tidyRunDiagnostics(tdoc) >= 0) && (tidySaveBuffer(tdoc, &output) >= 0) && (output.bp != 0 && output.size > 0)) { res = QString::fromUtf8((char*)output.bp, output.size); ok = true; } #ifdef DEBUG_MARKUP if (errbuf.size > 0) { QString errStr = QString::fromUtf8((char*)errbuf.bp, errbuf.size); qDebug() << "\n[DEBUG] MARKUP, libtidy errors and warnings:\n" << errStr; } #endif if (output.bp != 0) tidyBufFree(&output); if (errbuf.bp != 0) tidyBufFree(&errbuf); tidyRelease(tdoc); return res.trimmed(); #endif }
/**
 * Parses the markup in `x` with the member tidy document, cleans it, runs
 * diagnostics and writes the result to the file named by `path`.
 *
 * Return codes of the tidy calls are not inspected; every step always runs.
 */
void tidyhtml::parse(std::string x, std::string path)
{
    // NOTE(review): a NULL error buffer is installed here. Confirm that the
    // libtidy version in use tolerates diagnostics being emitted to it.
    tidySetErrorBuffer(tdoc, NULL);

    const char* markup = x.c_str();
    tidyParseString(tdoc, markup);
    tidyCleanAndRepair(tdoc);
    tidyRunDiagnostics(tdoc);

    const char* destination = path.c_str();
    tidySaveFile(tdoc, destination);
}
int main(int argc, char **argv ) { CURL *curl; char curl_errbuf[CURL_ERROR_SIZE]; TidyDoc tdoc; TidyBuffer docbuf = {0}; TidyBuffer tidy_errbuf = {0}; int err; if ( argc == 2) { curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_URL, argv[1]); curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); tdoc = tidyCreate(); tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ tidyOptSetInt(tdoc, TidyWrapLen, 4096); tidySetErrorBuffer( tdoc, &tidy_errbuf ); tidyBufInit(&docbuf); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); err=curl_easy_perform(curl); if ( !err ) { err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ if ( err >= 0 ) { err = tidyCleanAndRepair(tdoc); /* fix any problems */ if ( err >= 0 ) { err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ if ( err >= 0 ) { dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */ fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ } } } } else fprintf(stderr, "%s\n", curl_errbuf); /* clean-up */ curl_easy_cleanup(curl); tidyBufFree(&docbuf); tidyBufFree(&tidy_errbuf); tidyRelease(tdoc); return(err); } else printf( "usage: %s <url>\n", argv[0] ); return(0); }
/* Run the tidy document held in "htv" over the Perl scalar "html".

   On entry "* output_ptr" and "* errors_ptr" are set to the undefined
   value. When the end of the function is reached they are replaced by
   newly created scalars holding the contents of tidy's output buffer and
   error buffer respectively, and "html_valid_ok" is returned.

   NOTE(review): the behaviour of CALL_TIDY when a tidy call fails is
   defined elsewhere; presumably it returns early with an error status,
   in which case the two output pointers keep the undefined value --
   confirm against the macro definition. */

static html_valid_status_t html_valid_run (html_valid_t * htv, SV * html, SV ** output_ptr, SV ** errors_ptr)
{
    const char * html_string;
    STRLEN html_length;
    SV * output;
    SV * errors;
    TidyBuffer tidy_output = {0};
    TidyBuffer tidy_errbuf = {0};

    /* First set these up sanely in case the stuff hits the fan. */
    * output_ptr = & PL_sv_undef;
    * errors_ptr = & PL_sv_undef;

    /* Work around bug where allocator sometimes does not get set. */
    CopyAllocator (htv->tdoc, & tidy_output);
    CopyAllocator (htv->tdoc, & tidy_errbuf);

    /* Fetch the string value of the Perl scalar; only the pointer is
       passed on to tidy, "html_length" is not used afterwards. */
    html_string = SvPV (html, html_length);

    /* "n_mallocs" is incremented once for each of the two tidy buffers
       and decremented again when each buffer is freed below. */
    CALL_TIDY (tidySetErrorBuffer (htv->tdoc, & tidy_errbuf));
    htv->n_mallocs++;
    CALL_TIDY (tidyParseString (htv->tdoc, html_string));
    CALL_TIDY (tidyCleanAndRepair (htv->tdoc));
    CALL_TIDY (tidyRunDiagnostics (htv->tdoc));
    CALL_TIDY (tidySaveBuffer (htv->tdoc, & tidy_output));
    htv->n_mallocs++;

    /* Copy the contents of the buffers into the Perl scalars. */
    output = newSVpv ((char *) tidy_output.bp, tidy_output.size);
    errors = newSVpv ((char *) tidy_errbuf.bp, tidy_errbuf.size);

    /* HTML Tidy randomly segfaults here due to "allocator" not being set in some cases, hence the above CopyAllocator fix. */
    tidyBufFree (& tidy_output);
    htv->n_mallocs--;
    tidyBufFree (& tidy_errbuf);
    htv->n_mallocs--;

    /* These are not our mallocs, they are Perl's mallocs, so we don't increase htv->n_mallocs for these. After we return them, we no longer take care of these. */
    * output_ptr = output;
    * errors_ptr = errors;
    return html_valid_ok;
}
void TidyNetworkReply::tidyUp() { QUrl redirect = reply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl(); if (redirect.isValid()) { redirect.setScheme("tidy"); setAttribute(QNetworkRequest::RedirectionTargetAttribute, QVariant(redirect)); emit finished(); reply->deleteLater(); return; } int rc = -1; Bool ok; ok = tidyOptSetBool( tdoc, TidyXmlOut, yes ); // Convert to XHTML if (ok) ok = tidyOptSetBool(tdoc, TidyQuoteNbsp, no); //if (ok) //ok = tidyOptSetValue(tdoc, TidyBlockTags, "header,nav,article,time,section,footer"); if ( ok ) rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics if ( rc >= 0 ) rc = tidyParseString( tdoc, reply->readAll() ); // Parse the input if ( rc >= 0 ) rc = tidyCleanAndRepair( tdoc ); // Tidy it up! if ( rc >= 0 ) rc = tidyRunDiagnostics( tdoc ); // Kvetch if ( rc > 1 ) // If error, force output. rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 ); if ( rc >= 0 ) rc = tidySaveBuffer( tdoc, &output ); // Pretty Print if ( rc >= 0 ) { if ( rc > 0 ) { ;//printf( "\nDiagnostics:\n\n%s", errbuf.bp ); } } else { ;//printf( "A severe error (%d) occurred.\n", rc ); } open(ReadOnly); emit readyRead(); emit finished(); reply->deleteLater(); //QTimer::singleShot(0, this, SIGNAL(readyRead())); //QTimer::singleShot(0, this, SIGNAL(finished())); }
void HTMLTidy::run() throw( std::runtime_error ) { TidyBuffer outputBuffer = { 0 }; TidyBuffer errorBuffer = { 0 }; // try to create valid XHTML document for XML parser: int tidyResult = -1; if( tidyOptSetBool( handle, TidyXhtmlOut, yes ) ) { tidyResult = tidySetErrorBuffer( handle, &errorBuffer ); } if( tidyResult >= 0 ) { tidyResult = tidyParseString( handle, document.c_str() ); } if( tidyResult >= 0 ) { tidyResult = tidyCleanAndRepair( handle ); } if( tidyResult >= 0 ) { tidyResult = tidyRunDiagnostics( handle ); } if( tidyResult > 1 ) { if( !tidyOptSetBool( handle, TidyForceOutput, yes ) ) { tidyResult = -1; } } if( tidyResult >= 0 ) { tidyResult = tidySaveBuffer( handle, &outputBuffer ); } if( tidyResult > 0 ) { std::clog << "*********************************" << std::endl; std::clog << "HTMLTidy: Diagnostics of libtidy:" << std::endl; std::clog << errorBuffer.bp; std::clog << "*********************************" << std::endl; } else if( tidyResult < 0 ) { std::stringstream sstrTidyResult; sstrTidyResult << tidyResult; throw std::runtime_error( "HTMLTidy: A severe error occured while tidying up the received document (" + sstrTidyResult.str() + ")." ); } resultDocument.reserve( outputBuffer.size ); // avoid frequent (re-)allocations for( unsigned int i = 0; i < outputBuffer.size; i++ ) { resultDocument.insert( resultDocument.end(), static_cast< char >( *(outputBuffer.bp + i) ) ); } tidyBufFree( &outputBuffer ); tidyBufFree( &errorBuffer ); }
/**
 * Creates a tidy document for `html`, configures it for XML output in
 * UTF-8 with LF line endings, then parses and repairs the input.
 *
 * Diagnostics are collected in m_errorOutput. Return codes of the tidy
 * calls are not inspected.
 */
HtmlTidy::HtmlTidy(const QString& html)
    : m_tidyDoc(tidyCreate()),
      m_errorOutput(),
      m_output(),
      m_input(html)
{
    // Output configuration.
    tidyOptSetBool(m_tidyDoc, TidyXmlOut, yes);
    tidyOptSetValue(m_tidyDoc, TidyCharEncoding, "utf8");
    tidyOptSetInt(m_tidyDoc, TidyNewline, TidyLF);
    tidyOptSetBool(m_tidyDoc, TidyQuoteNbsp, no);
    tidyOptSetBool(m_tidyDoc, TidyForceOutput, yes);

    // Diagnostics go to the member buffer rather than tidy's default sink.
    tidySetErrorBuffer(m_tidyDoc, &m_errorOutput);

    const QByteArray utf8Input = m_input.toUtf8();
    tidyParseString(m_tidyDoc, utf8Input.constData());
    tidyCleanAndRepair(m_tidyDoc);
}
/**
 * Takes HTML code and returns it converted to XHTML code.
 *
 * @param input  HTML markup.
 * @return       The XHTML produced by libtidy, or an empty string when
 *               tidy reported a severe error or produced no output.
 */
QString tidy(QString input)
{
    // the following code is (c) Charles Reitzel and Dave Raggett, see the package tidy
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};
    QString result;
    int rc = -1;

    TidyDoc tdoc = tidyCreate();                       // Initialize "document"

    const Bool ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );   // Convert to XHTML
    if ( ok )
        rc = tidySetErrorBuffer( tdoc, &errbuf );      // Capture diagnostics
    tidySetCharEncoding( tdoc, "utf8" );

    // Keep the UTF-8 bytes alive for the duration of the parse.
    const QByteArray utf8Input = input.toUtf8();
    if ( rc >= 0 )
        rc = tidyParseString( tdoc, utf8Input.constData() );     // Parse the input
    if ( rc >= 0 )
        rc = tidyCleanAndRepair( tdoc );               // Tidy it up!
    if ( rc >= 0 )
        rc = tidyRunDiagnostics( tdoc );               // Kvetch
    if ( rc > 1 )                                      // If error, force output.
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
    if ( rc >= 0 )
        rc = tidySaveBuffer( tdoc, &output );          // Pretty Print

    if ( rc >= 0 )
    {
        // Convert straight from the tidy buffer. The previous malloc/snprintf
        // copy leaked its buffer, dropped the last character and crashed
        // when tidy produced no output at all.
        if ( output.bp != 0 )
            result = QString::fromUtf8( reinterpret_cast<const char*>(output.bp),
                                        static_cast<int>(output.size) );
    }
    else
        printf( "A severe error (%d) occurred.\n", rc );

    // Brace-initialised buffers that were never written must not be freed.
    if ( output.bp != 0 )
        tidyBufFree( &output );
    if ( errbuf.bp != 0 )
        tidyBufFree( &errbuf );
    tidyRelease( tdoc );

    // NOTE(review): as it reads here, pattern and replacement look identical;
    // the original literal was possibly a mis-encoded sequence -- confirm.
    result=result.replace("ö","ö");
    return result;
}
bool CCFHtmlTidy::TidyMain(const char* pSourceIn, const char* pOptions, std::string &strOut, std::string &strErr) { TidyBuffer output; TidyBuffer errbuf; int rc = -1; Bool ok = yes; TidyDoc tdoc = tidyCreate(); // Initialize "document" tidyBufInit(&output); tidyBufInit(&errbuf); TidyOptionsSet(tidyDocToImpl(tdoc), pOptions); if (ok) rc = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics if (rc >= 0) rc = tidyParseString(tdoc, pSourceIn); // Parse the input if (rc >= 0) rc = tidyCleanAndRepair(tdoc); // Tidy it up! if (rc >= 0) rc = tidyRunDiagnostics(tdoc); // Kvetch //if ( rc > 1 ) // If error, force output. // rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 ); if (rc >= 0) rc = tidySaveBuffer(tdoc, &output); // Pretty Print if (rc >= 0) { if (output.bp) { strOut = reinterpret_cast<char const*>(output.bp); } } strErr = reinterpret_cast<char const*>(errbuf.bp); std::string strEmpty = "No warnings or errors were found.\n\n"; if (0 == strEmpty.compare(strErr)) { strErr.clear(); } tidyBufFree(&output); tidyBufFree(&errbuf); tidyRelease(tdoc); return true; }
void tidy(std::string &input) { TidyBuffer output = {0}; TidyBuffer errbuf = {0}; TidyDoc tdoc = tidyCreate(); tidyOptSetBool(tdoc, TidyXhtmlOut, yes); tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics tidyParseString(tdoc, input.c_str()); tidyCleanAndRepair(tdoc); tidySaveBuffer(tdoc, &output); input = std::string((const char*)output.bp); tidyBufFree(&output); tidyBufFree(&errbuf); tidyRelease(tdoc); }
bool HTidyInterface::formatSource( const char* textIn, CString &strTidy, CString &strMsg ) { TidyBuffer output; TidyBuffer errbuf; int rc = -1; Bool ok = yes; TidyDoc tdoc = tidyCreate(); // Initialize "document" tidyBufInit(&output); tidyBufInit(&errbuf); InitTidyDefault(tdoc); SetTidyConfig(tdoc); if ( ok ) rc = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics if ( rc >= 0 ) rc = tidyParseString(tdoc, textIn); // Parse the input if ( rc >= 0 ) rc = tidyCleanAndRepair(tdoc); // Tidy it up! if ( rc >= 0 ) rc = tidyRunDiagnostics(tdoc); // Kvetch //if ( rc > 1 ) // If error, force output. // rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 ); if ( rc >= 0 ) rc = tidySaveBuffer(tdoc, &output); // Pretty Print if ( rc >= 0 ) { strTidy = reinterpret_cast< char const* >(output.bp); } strMsg = reinterpret_cast< char const* >(errbuf.bp); CString strEmpty = _T("No warnings or errors were found.\r\n\r\n"); if (0 == strEmpty.Compare(strMsg)) { strMsg.Empty(); } tidyBufFree(&output); tidyBufFree(&errbuf); tidyRelease(tdoc); return true; }
/*
 * Allocates and initialises a PHPTidyObj for `class_type`.
 *
 * For `is_doc` objects a PHPTidyDoc is created as well: a fresh tidy
 * document with its own error buffer, ForceOutput enabled, TidyMark
 * disabled, the default configuration applied and the default properties
 * added. `is_node` objects get no extra state.
 *
 * Returns the embedded zend_object with `handlers` installed.
 */
static zend_object *tidy_object_new(zend_class_entry *class_type, zend_object_handlers *handlers, tidy_obj_type objtype)
{
	PHPTidyObj *intern = ecalloc(1, sizeof(PHPTidyObj) + zend_object_properties_size(class_type));

	zend_object_std_init(&intern->std, class_type);
	object_properties_init(&intern->std, class_type);

	if (objtype == is_doc) {
		PHPTidyDoc *ptdoc = emalloc(sizeof(PHPTidyDoc));

		intern->ptdoc = ptdoc;
		ptdoc->doc = tidyCreate();
		ptdoc->ref_count = 1;
		ptdoc->initialized = 0;
		ptdoc->errbuf = emalloc(sizeof(TidyBuffer));
		tidyBufInit(ptdoc->errbuf);

		if (tidySetErrorBuffer(ptdoc->doc, ptdoc->errbuf) != 0) {
			/* Undo everything allocated so far before raising the error.
			   NOTE(review): E_ERROR is presumably fatal, so control is not
			   expected to come back here -- confirm. */
			tidyBufFree(ptdoc->errbuf);
			efree(ptdoc->errbuf);
			tidyRelease(ptdoc->doc);
			efree(ptdoc);
			efree(intern);
			php_error_docref(NULL, E_ERROR, "Could not set Tidy error buffer");
		}

		tidyOptSetBool(ptdoc->doc, TidyForceOutput, yes);
		tidyOptSetBool(ptdoc->doc, TidyMark, no);

		TIDY_SET_DEFAULT_CONFIG(ptdoc->doc);

		tidy_add_default_properties(intern, is_doc);
	}

	intern->std.handlers = handlers;

	return &intern->std;
}
/*
 * Output handler that runs the complete buffered output through tidy.
 *
 * It acts only when clean_output is enabled and the handler is invoked
 * with both the START and FINAL flags, i.e. on the whole output at once.
 * On success the tidied buffer is handed to `output_context->out`, which
 * is marked to be freed by the output layer, and SUCCESS is returned.
 * Otherwise FAILURE is returned.
 */
static int php_tidy_output_handler(void **nothing, php_output_context *output_context)
{
	int status = FAILURE;
	TidyDoc doc;
	TidyBuffer inbuf, outbuf, errbuf;

	if (TG(clean_output) && (output_context->op & PHP_OUTPUT_HANDLER_START) && (output_context->op & PHP_OUTPUT_HANDLER_FINAL)) {
		/* Reject over-long input before any tidy resource exists, so this
		   early return cannot leak the document or the error buffer. */
		if (ZEND_SIZE_T_UINT_OVFL(output_context->in.used)) {
			php_error_docref(NULL, E_WARNING, "Input string is too long");
			return status;
		}

		doc = tidyCreate();
		tidyBufInit(&errbuf);

		if (0 == tidySetErrorBuffer(doc, &errbuf)) {
			tidyOptSetBool(doc, TidyForceOutput, yes);
			tidyOptSetBool(doc, TidyMark, no);

			TIDY_SET_DEFAULT_CONFIG(doc);

			/* Attach (not copy) the pending output as tidy's input. */
			tidyBufInit(&inbuf);
			tidyBufAttach(&inbuf, (byte *) output_context->in.data, (uint)output_context->in.used);

			if (0 <= tidyParseBuffer(doc, &inbuf) && 0 <= tidyCleanAndRepair(doc)) {
				tidyBufInit(&outbuf);
				tidySaveBuffer(doc, &outbuf);
				FIX_BUFFER(&outbuf);
				/* Ownership of outbuf's memory passes to the output layer. */
				output_context->out.data = (char *) outbuf.bp;
				output_context->out.used = outbuf.size ? outbuf.size-1 : 0;
				output_context->out.free = 1;
				status = SUCCESS;
			}
		}

		tidyRelease(doc);
		tidyBufFree(&errbuf);
	}

	return status;
}
int main(int argc, char **argv ) { const char* input = "<title>Hello</title><p>World!"; TidyBuffer output = {0}; TidyBuffer errbuf = {0}; int rc = -1; Bool ok; // Initialize "document" TidyDoc tdoc = tidyCreate(); printf( "Tidying:\t%s\n", input ); // Convert to XHTML ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ); if ( ok ) rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics if ( rc >= 0 ) rc = tidyParseString( tdoc, input ); // Parse the input if ( rc >= 0 ) rc = tidyCleanAndRepair( tdoc ); // Tidy it up! if ( rc >= 0 ) rc = tidyRunDiagnostics( tdoc ); // Kvetch if ( rc > 1 ) // If error, force output. rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 ); if ( rc >= 0 ) rc = tidySaveBuffer( tdoc, &output ); // Pretty Print if ( rc >= 0 ) { if ( rc > 0 ) printf( "\nDiagnostics:\n\n%s", errbuf.bp ); printf( "\nAnd here is the result:\n\n%s", output.bp ); } else printf( "A severe error (%d) occurred.\n", rc ); tidyBufFree( &output ); tidyBufFree( &errbuf ); tidyRelease( tdoc ); return rc; }
// quick and dirty shortcut function. int lua_tidy_easyClean ( lua_State *L ) { TidyBuffer output; TidyBuffer errbuf; int rc; pTidy t; const char * input; tidyBufInit(&output); tidyBufInit(&errbuf); rc = -1; t = toTidy(L,1); input = lua_tostring(L,2); rc = tidySetErrorBuffer( t->tdoc, &errbuf ); if ( rc >= 0 ) rc = tidyParseString( t->tdoc, input ); if ( rc >= 0 ) rc = tidyCleanAndRepair( t->tdoc ); if ( rc >= 0 ) rc = tidyRunDiagnostics( t->tdoc ); if ( rc >= 0 ) rc = tidySaveBuffer( t->tdoc, &output ); lua_pushlstring(L, (char*)output.bp,output.size); if ( rc != 0 ) lua_pushlstring(L, (char*)errbuf.bp,errbuf.size); else lua_pushnil(L); lua_pushnumber(L, rc); tidyBufFree( &output ); tidyBufFree( &errbuf ); return 3; }
/*
 * Parses `html` with libtidy and passes the repaired document's <html>
 * node to html_find_objects(), which fills `objs`.
 *
 * Does nothing when `html` is NULL.
 */
void html_parse(const gchar* html, GSList** objs)
{
  TidyDoc tdoc;
  TidyBuffer tidy_errbuf = {0};
  int err = 0;

  if ( html == NULL )
    return;

  tdoc = tidyCreate();
  tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
  tidyOptSetInt(tdoc, TidyWrapLen, 4096);
  tidySetErrorBuffer( tdoc, &tidy_errbuf );

  err = tidyParseString(tdoc, html); /* parse the input */
  if ( err >= 0 ) {
    err = tidyCleanAndRepair(tdoc); /* fix any problems */
    if ( err >= 0 ) {
      err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
      if ( err >= 0 ) {
        html_find_objects(tidyGetHtml(tdoc), objs); /* walk the tree */
      }
    }
  }

  /* The diagnostics are never read; release them instead of leaking them.
     The buffer was brace-initialised, so free it only if tidy wrote to it. */
  if ( tidy_errbuf.bp )
    tidyBufFree(&tidy_errbuf);

  /* NOTE(review): tdoc is never released. It is left alone here because
     the entries stored in *objs may point into the tidy document; if
     html_find_objects() copies everything it keeps, add tidyRelease(tdoc). */
}
/*
 * Shared implementation of the "repair a string" / "repair a file" entry
 * points.
 *
 * Arguments read from the PHP call: the data (string, or file path when
 * `is_file` is set), an optional config value, an optional encoding name
 * and an optional use_include_path flag. The return value is set to the
 * repaired markup, or to FALSE on failure.
 */
static void php_tidy_quick_repair(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_file)
{
	char *enc = NULL;
	size_t enc_len = 0;
	zend_bool use_include_path = 0;
	TidyDoc doc;
	TidyBuffer *errbuf;
	zend_string *data, *arg1;
	zval *config = NULL;

	if (is_file) {
		if (zend_parse_parameters(ZEND_NUM_ARGS(), "P|zsb", &arg1, &config, &enc, &enc_len, &use_include_path) == FAILURE) {
			RETURN_FALSE;
		}
		/* The file contents are loaded into a string this function owns. */
		if (!(data = php_tidy_file_to_mem(ZSTR_VAL(arg1), use_include_path))) {
			RETURN_FALSE;
		}
	} else {
		if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|zsb", &arg1, &config, &enc, &enc_len, &use_include_path) == FAILURE) {
			RETURN_FALSE;
		}
		data = arg1;
	}

	if (ZEND_SIZE_T_UINT_OVFL(ZSTR_LEN(data))) {
		php_error_docref(NULL, E_WARNING, "Input string is too long");
		/* In file mode `data` was allocated above and must not leak. */
		if (is_file) {
			zend_string_release(data);
		}
		RETURN_FALSE;
	}

	doc = tidyCreate();
	errbuf = emalloc(sizeof(TidyBuffer));
	tidyBufInit(errbuf);

	if (tidySetErrorBuffer(doc, errbuf) != 0) {
		tidyBufFree(errbuf);
		efree(errbuf);
		tidyRelease(doc);
		php_error_docref(NULL, E_ERROR, "Could not set Tidy error buffer");
	}

	tidyOptSetBool(doc, TidyForceOutput, yes);
	tidyOptSetBool(doc, TidyMark, no);

	TIDY_SET_DEFAULT_CONFIG(doc);

	if (config) {
		TIDY_APPLY_CONFIG_ZVAL(doc, config);
	}

	if(enc_len) {
		if (tidySetCharEncoding(doc, enc) < 0) {
			php_error_docref(NULL, E_WARNING, "Could not set encoding '%s'", enc);
			RETVAL_FALSE;
		}
	}

	if (data) {
		TidyBuffer buf;

		/* Attach (not copy) the input string as tidy's source buffer. */
		tidyBufInit(&buf);
		tidyBufAttach(&buf, (byte *) ZSTR_VAL(data), (uint)ZSTR_LEN(data));

		if (tidyParseBuffer(doc, &buf) < 0) {
			php_error_docref(NULL, E_WARNING, "%s", errbuf->bp);
			RETVAL_FALSE;
		} else {
			if (tidyCleanAndRepair(doc) >= 0) {
				TidyBuffer output;
				tidyBufInit(&output);

				tidySaveBuffer (doc, &output);
				FIX_BUFFER(&output);
				RETVAL_STRINGL((char *) output.bp, output.size ? output.size-1 : 0);
				tidyBufFree(&output);
			} else {
				RETVAL_FALSE;
			}
		}
	}

	if (is_file) {
		zend_string_release(data);
	}

	tidyBufFree(errbuf);
	efree(errbuf);
	tidyRelease(doc);
}
/*! * \fn static int TidyHtml(const char *pcSourcePage, string &sDestPage); * \brief 修补丢失、错误标签 * \param [in]待修补网页字符串 * \param [out]修补后的网页string * \return 结果码,==0修补正确,<0修补失败 * \date 2011-06-01 * \author nanjunxiao */ int Pretreat::TidyHtml(const char *pcSourcePage, std::string &sDestPage) { int iReturn = 0; TidyBuffer errbuf = {0}; TidyDoc tdoc; tmbstr pBuffer = NULL; try { if ( (pcSourcePage == NULL) || (strlen(pcSourcePage) ==0 ) ) { //cerr << "TidyHtml 输入页面为空!" << endl; throw (-1); } int iRet = -1; Bool bOk; uint uiBufLen; int iBufSize; tdoc = tidyCreate();// Initialize "document" bOk = tidyOptSetBool(tdoc, TidyXhtmlOut, yes);// Convert to XHTML if (bOk) { iRet = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics } else { throw (-1); } if (iRet >= 0) { iRet = tidySetCharEncoding(tdoc,"utf8"); //Ensure dealing with gb2312 successfully } else { throw (-1); } if (iRet >= 0) { string htmlsrc = pcSourcePage; iRet = tidyParseString (tdoc, htmlsrc.c_str() ); // Parse the input } else { throw (-1); } if (iRet >= 0) { iRet = tidyCleanAndRepair(tdoc); //Tidy it up! } else { throw (-1); } if (iRet >= 0) { iRet = tidyRunDiagnostics(tdoc); //Kvetch } else { throw (-1); } if(iRet > 1) // If error, force output. { iRet = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? iRet : -1 ); } else if (iRet < 0) { throw (-1); } if (iRet >= 0) { // Pretty Print iBufSize = 1024 * 1024 * 5; uiBufLen = iBufSize; pBuffer = new char [iBufSize]; memset(pBuffer, '\0', iBufSize); iRet = tidySaveString(tdoc, pBuffer, &uiBufLen); } else { throw (-1); } if (iRet >= 0) { sDestPage = pBuffer; } else if (iRet == -ENOMEM) { //pBuffer 长度不够 //cerr << "TidyHtml pBuffer长度不够!" << endl; throw (-1); } else { throw (-1); } } catch(exception &err) { //cerr << "TidyHtml HtmlTidy修补页面失败! " << err.what() << endl; iReturn = -1; } catch(int iThrow) { if (iThrow < 0) { //cerr << "TidyHtml HtmlTidy修补页面失败!" << endl; } iReturn = iThrow; } catch(...) { //cerr << "TidyHtml HtmlTidy修补页面失败!" 
<< endl; iReturn = -1; } tidyBufFree(&errbuf); tidyRelease(tdoc); if (pBuffer != NULL) { delete [] pBuffer; pBuffer = NULL; } return iReturn; }
bool TidyReader::openFile (const char * szFilename) { UT_DEBUGMSG(("using libtidy to parse HTML...\n")); m_tidy = tidyCreate (); if (m_tidy == 0) return false; if (tidyOptSetBool (m_tidy, TidyXhtmlOut, yes) == 0) { UT_DEBUGMSG(("tidyOptSetBool failed!\n")); closeFile (); return false; } #ifndef DEBUG tidySetErrorBuffer (m_tidy, &m_errbuf); #endif int parse_status; if (m_buffer && m_length) { UT_DEBUGMSG(("parse HTML in buffer...\n")); UT_Byte * buffer = const_cast<UT_Byte *>(m_buffer); // grr. TidyBuffer inbuf; tidyBufInit (&inbuf); tidyBufAttach (&inbuf, buffer, static_cast<unsigned int>(m_length)); parse_status = tidyParseBuffer (m_tidy, &inbuf); tidyBufDetach (&inbuf); } else { UT_DEBUGMSG(("parse HTML in file: %s\n",szFilename)); parse_status = tidyParseFile (m_tidy, szFilename); } if (parse_status < 0) { UT_DEBUGMSG(("tidyParseBuffer/File failed!\n")); closeFile (); return false; } parse_status = tidyCleanAndRepair (m_tidy); if (parse_status < 0) { UT_DEBUGMSG(("tidyCleanAndRepair failed!\n")); closeFile (); return false; } parse_status = tidyRunDiagnostics (m_tidy); if (parse_status < 0) { UT_DEBUGMSG(("tidyRunDiagnostics failed!\n")); closeFile (); return false; } if (parse_status > 1) { parse_status = (tidyOptSetBool (m_tidy, TidyForceOutput, yes) ? parse_status : -1); } if (parse_status < 0) { UT_DEBUGMSG(("tidyOptSetBool failed!\n")); closeFile (); return false; } parse_status = tidySaveBuffer (m_tidy, &m_outbuf); if (parse_status < 0) { UT_DEBUGMSG(("tidySaveBuffer failed!\n")); closeFile (); return false; } UT_DEBUGMSG(("tidy succeeded!\n")); #ifdef DEBUG fputs ("================================================================\n", stderr); fputs ((const char *) m_outbuf.bp, stderr); fputs ("================================================================\n", stderr); #endif m_outbuf.next = 0; return true; }
/*
 * fixup(text [, encoding]) -> (xhtml, diagnostics)
 *
 * Runs `text` through tidy configured for XHTML output. When `encoding`
 * is given it is used for both input and output; otherwise tidy's default
 * input encoding is used and the output is UTF-8.
 *
 * Returns a 2-tuple of strings, or NULL with IOError set when a tidy step
 * fails.
 */
static PyObject* elementtidy_fixup(PyObject* self, PyObject* args)
{
    int rc;
    TidyDoc doc;
    TidyBuffer out = {0};
    TidyBuffer err = {0};
    PyObject* pyout;
    PyObject* pyerr;

    char* text;
    char* encoding = NULL;
    if (!PyArg_ParseTuple(args, "s|s:fixup", &text, &encoding))
        return NULL;

    doc = tidyCreate();

    /* options for nice XHTML output */
    if (encoding)
        /* if an encoding is given, use it for both input and output */
        tidyOptSetValue(doc, TidyCharEncoding, encoding);
    else
        /* if no encoding is given, use default input and utf-8 output */
        tidyOptSetValue(doc, TidyOutCharEncoding, "utf8");
    tidyOptSetBool(doc, TidyForceOutput, yes);
    tidyOptSetInt(doc, TidyWrapLen, 0);
    tidyOptSetBool(doc, TidyQuiet, yes);
    tidyOptSetBool(doc, TidyXhtmlOut, yes);
    tidyOptSetBool(doc, TidyXmlDecl, yes);
    tidyOptSetInt(doc, TidyIndentContent, 0);
    tidyOptSetBool(doc, TidyNumEntities, yes);

    rc = tidySetErrorBuffer(doc, &err);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidySetErrorBuffer failed");
        goto error;
    }

    rc = tidyParseString(doc, text);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyParseString failed");
        goto error;
    }

    rc = tidyCleanAndRepair(doc);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyCleanAndRepair failed");
        goto error;
    }

    rc = tidyRunDiagnostics(doc);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed");
        goto error;
    }

    rc = tidySaveBuffer(doc, &out);
    if (rc < 0) {
        /* Report the step that actually failed. */
        PyErr_SetString(PyExc_IOError, "tidySaveBuffer failed");
        goto error;
    }

    /* A buffer tidy never wrote to has a NULL data pointer. */
    pyout = PyString_FromString(out.bp ? (const char*) out.bp : "");
    if (!pyout)
        goto error;
    pyerr = PyString_FromString(err.bp ? (const char*) err.bp : "");
    if (!pyerr) {
        Py_DECREF(pyout);
        goto error;
    }

    if (out.bp)
        tidyBufFree(&out);
    if (err.bp)
        tidyBufFree(&err);

    tidyRelease(doc);

    /* "N" hands both references over to the tuple. */
    return Py_BuildValue("NN", pyout, pyerr);

  error:
    /* The buffers were brace-initialised: free only what was allocated. */
    if (out.bp)
        tidyBufFree(&out);
    if (err.bp)
        tidyBufFree(&err);

    tidyRelease(doc);

    return NULL;
}
/**
 * Converts the HTML in `content` to XHTML with libtidy, parses the result
 * with TinyXML and lets CTiXmlProxyVistor walk the tree to fill m_array.
 *
 * The XHTML is parsed straight from tidy's output buffer. Previously it
 * was re-read from "<cwd>/data/_tmp.xml", a file this function only wrote
 * in _DEBUG builds, so other builds saw a missing or stale file. The file
 * is still written in _DEBUG builds as a debugging aid.
 *
 * @param content  HTML page text.
 * @return 0 on success, -2 when tidy fails or the XHTML cannot be parsed.
 */
int CProxyParse::RunFromMem( wxString content )
{
	wxString data_path = wxGetCwd() + "/data/";
	wxString path1 = data_path + "_tmp.xml";

	if (!wxDirExists(data_path))
		wxMkdir(data_path);

	// Tidy is told the input is UTF-8, so hand it UTF-8 bytes. The earlier
	// fixed-size copy sized by character count could truncate non-ASCII
	// text and leave the buffer without a terminator.
	const wxCharBuffer utf8 = content.utf8_str();
	const char *pSource = utf8.data();
	if (pSource == NULL)
		pSource = "";

	wxLogMessage("Run Tidy!");

	TidyBuffer output;
	TidyBuffer errbuf;
	int rc = -1;
	Bool ok;

	TidyDoc tdoc = tidyCreate();                     // Initialize "document"
	tidyBufInit( &output );
	tidyBufInit( &errbuf );

	tidySetCharEncoding(tdoc, "utf8");
	ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );  // Convert to XHTML
	if ( ok )
		rc = tidySetErrorBuffer( tdoc, &errbuf );    // Capture diagnostics
	if ( rc >= 0 )
		rc = tidyParseString( tdoc, pSource );       // Parse the input
	if ( rc >= 0 )
		rc = tidyCleanAndRepair( tdoc );             // Tidy it up!
	if ( rc >= 0 )
		rc = tidyRunDiagnostics( tdoc );             // Kvetch
	if ( rc > 1 )                                    // If error, force output.
		rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
	if ( rc >= 0 )
		rc = tidySaveBuffer( tdoc, &output );        // Pretty Print

	TiXmlDocument doc(path1);
	bool parsed = false;

	if ( rc >= 0 && output.bp != NULL )
	{
#ifdef _DEBUG
		WriteAllToFile(path1, (char*)output.bp, output.size);
#endif
		wxLogMessage("Fetch data!");

		// Parse while tidy's buffer is still alive.
		doc.Parse((const char*)output.bp, 0, TIXML_ENCODING_UTF8);
		parsed = !doc.Error();
	}
	else
		wxLogError("tidyFail");

	tidyBufFree( &output );
	tidyBufFree( &errbuf );
	tidyRelease( tdoc );

	// Walk the parsed document.
	TiXmlElement *pRoot = parsed ? doc.RootElement() : NULL;
	if (pRoot == NULL)
	{
		wxLogMessage("shit");
		return -2;
	}

	CTiXmlProxyVistor vistor(&m_array);
	pRoot->Accept(&vistor);

	return 0;
}
int main(int argc, char **argv ) { CURL *curl; char curl_errbuf[CURL_ERROR_SIZE]; char url[URL_BUF_SIZE]; char *username; TidyDoc tdoc; TidyBuffer docbuf = {0}; TidyBuffer tidy_errbuf = {0}; int err; if ( argc == 2) { username = argv[1]; } else { username = "******"; } WeatherData data; snprintf(url, URL_BUF_SIZE, "http://www.weatherlink.com/user/%s/index.php?view=summary&headers=0&type=2", username); curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); curl_easy_setopt(curl, CURLOPT_VERBOSE, 0L); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); tdoc = tidyCreate(); tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ tidyOptSetInt(tdoc, TidyWrapLen, 4096); tidySetErrorBuffer( tdoc, &tidy_errbuf ); tidyBufInit(&docbuf); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); err=curl_easy_perform(curl); if ( !err ) { err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ if ( err >= 0 ) { err = tidyCleanAndRepair(tdoc); /* fix any problems */ if ( err >= 0 ) { dumpNode( tdoc, tidyGetRoot(tdoc), 0, &data ); /* walk the tree */ //err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ //if ( err >= 0 ) //{ //dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */ // fprintf(stderr, ">> %s\n", tidy_errbuf.bp); /* show errors */ //} } } } else { fprintf(stderr, "%s\n", curl_errbuf); } printf("Outside temp: %f\n", data.outsideTemp ); printf("Outside humidity: %d\n", data.outsideHumidity ); printf("Dew Point: %f\n", data.dewPoint ); printf("Barometer: %f\n", data.barometer ); printf("Wind speed: %f\n", data.instantWindSpeed ); printf("Wind direction: %d\n", data.instantWindDirection ); printf("Average Wind: %f\n", data.avgWindSpeed_2min ); printf("Wind Gust: %f\n", data.windGust_10min); printf("rainRate: %f\n", data.rainRate ); printf("dailyRain: %f\n", data.dailyRain ); printf("lastHourRain: %f\n", data.lastHourRain ); /* 
clean-up */ curl_easy_cleanup(curl); tidyBufFree(&docbuf); //tidyBufFree(&tidy_errbuf); tidyRelease(tdoc); return(err); return(0); }
/*
 * parseString(text [, options]) -> tidied text
 *
 * `options`, when given, must be a dictionary mapping tidy option names to
 * values; each entry is applied to the document before parsing. An
 * unknown option name raises KeyError.
 *
 * NOTE(review): TDOC_RETURN() and PY_TO_TIDY() are defined elsewhere; the
 * references taken on key_list/item/value are presumably not dropped on
 * those early-exit paths -- confirm against the macro definitions.
 */
static PyObject *parseString(PyObject *self, PyObject *args)
{
    char *cp;
    int i, len, list_size;
    TidyDoc tdoc;
    TidyOption option = TidyUnknownOption;
    PyObject *res = NULL, *arglist = NULL;
    PyObject *key_list = NULL, *item = NULL, *value = NULL;
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};

    if (!PyArg_ParseTuple(args, "s#|O", &cp, &len, &arglist))
        return NULL;

    if (arglist && !PyDict_Check(arglist)) {
        PyErr_SetString(PyExc_TypeError, "Second argument must be a dictionary!");
        return NULL;
    }

    tdoc = tidyCreate();
    tidySetErrorBuffer(tdoc, &errbuf);

    if (!arglist)
        goto im_so_lazy; /* no args provided */

    key_list = PyDict_Keys(arglist);
    if (key_list == NULL) {
        /* PyDict_Keys failed with an exception set; do not iterate NULL. */
        if (errbuf.bp)
            tidyBufFree(&errbuf);
        tidyRelease(tdoc);
        return NULL;
    }
    list_size = PyList_Size(key_list);

    for (i = 0; i < list_size; i++) {
        item = PyList_GetItem(key_list, i);
        value = PyDict_GetItem(arglist, item);
        Py_INCREF(item);
        Py_INCREF(value);

        option = tidyGetOptionByName(tdoc, PyString_AsString(item));

        if (option == TidyUnknownOption) {
            PyErr_Format(PyExc_KeyError, "Unknown tidy option '%s'", PyString_AsString(item));
            TDOC_RETURN();
        }

        switch (tidyOptGetType(option)) {
            case TidyString:
                PY_TO_TIDY(String_Check, Value, String_AsString, "a String");
                break;
            case TidyInteger:
                PY_TO_TIDY(Int_Check, Int, Int_AsLong, "an Integer");
                break;
            case TidyBoolean:
                PY_TO_TIDY(Int_Check, Bool, Int_AsLong, "a Boolean or an Integer");
                break;
            default:
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Something strange happened, there is no option type %d",
                             tidyOptGetType(option));
                TDOC_RETURN();
            }
        }
        Py_DECREF(item);
        Py_DECREF(value);
    }

    /* PyDict_Keys returned a new reference; it used to be leaked here. */
    Py_DECREF(key_list);
    key_list = NULL;

im_so_lazy:
    tidyParseString(tdoc, cp);
    tidyCleanAndRepair(tdoc);
    tidySaveBuffer(tdoc, &output);

    res = Py_BuildValue("s#", output.bp, output.size);

    /* The buffers were brace-initialised: free only what was allocated. */
    if (output.bp)
        tidyBufFree(&output);
    if (errbuf.bp)
        tidyBufFree(&errbuf);
    tidyRelease(tdoc);
    return res;
}