// ------------------------------------------------------------- void Webpage::tidy_me() { try { TidyDoc _tdoc = tidyCreate(); //tidyOptSetBool(_tdoc, tidyOptGetIdForName("show-body-only"), (Bool)1); tidyOptSetBool(_tdoc, tidyOptGetIdForName("output-xhtml"), (Bool)1); tidyOptSetBool(_tdoc, tidyOptGetIdForName("quote-nbsp"), (Bool)0); tidyOptSetBool(_tdoc, tidyOptGetIdForName("show-warnings"), (Bool)0); tidyOptSetValue(_tdoc, tidyOptGetIdForName("char-encoding"), "utf8"); //tidyOptSetBool(_tdoc, tidyOptGetIdForName("ascii-chars"), (Bool)1); //tidyOptSetBool(_tdoc, tidyOptGetIdForName("markup"), (Bool)1); //tidyOptSetValue(_tdoc, tidyOptGetIdForName("indent"), "yes"); //tidyOptSetValue(_tdoc, tidyOptGetIdForName("newline"), "\n"); tidyOptSetInt(_tdoc, tidyOptGetIdForName("wrap"), 5000); tidyParseString( _tdoc, contents.c_str() ); /* // tidySaveBuffer doesn't seem to work with the makefile for some reason. TidyBuffer output = {0}; tidySaveBuffer(_tdoc, &output); cout << "3. TidyBuffer size: " << output.size << endl; contents = string((char*)output.bp, (size_t)output.size); */ // tidySaveString is a tricky beast. tmbstr buffer = NULL; uint buflen = 0; int status; do { status = tidySaveString( _tdoc, buffer, &buflen ); if (status == -ENOMEM) { if(buffer) free(buffer); buffer = (tmbstr)malloc(buflen + 1); } } while (status == -ENOMEM); contents = (char*)buffer; } catch (exception& e) { throw e.what(); } }
/*! * \fn static int TidyHtml(const char *pcSourcePage, string &sDestPage); * \brief 修补丢失、错误标签 * \param [in]待修补网页字符串 * \param [out]修补后的网页string * \return 结果码,==0修补正确,<0修补失败 * \date 2011-06-01 * \author nanjunxiao */ int Pretreat::TidyHtml(const char *pcSourcePage, std::string &sDestPage) { int iReturn = 0; TidyBuffer errbuf = {0}; TidyDoc tdoc; tmbstr pBuffer = NULL; try { if ( (pcSourcePage == NULL) || (strlen(pcSourcePage) ==0 ) ) { //cerr << "TidyHtml 输入页面为空!" << endl; throw (-1); } int iRet = -1; Bool bOk; uint uiBufLen; int iBufSize; tdoc = tidyCreate();// Initialize "document" bOk = tidyOptSetBool(tdoc, TidyXhtmlOut, yes);// Convert to XHTML if (bOk) { iRet = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics } else { throw (-1); } if (iRet >= 0) { iRet = tidySetCharEncoding(tdoc,"utf8"); //Ensure dealing with gb2312 successfully } else { throw (-1); } if (iRet >= 0) { string htmlsrc = pcSourcePage; iRet = tidyParseString (tdoc, htmlsrc.c_str() ); // Parse the input } else { throw (-1); } if (iRet >= 0) { iRet = tidyCleanAndRepair(tdoc); //Tidy it up! } else { throw (-1); } if (iRet >= 0) { iRet = tidyRunDiagnostics(tdoc); //Kvetch } else { throw (-1); } if(iRet > 1) // If error, force output. { iRet = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? iRet : -1 ); } else if (iRet < 0) { throw (-1); } if (iRet >= 0) { // Pretty Print iBufSize = 1024 * 1024 * 5; uiBufLen = iBufSize; pBuffer = new char [iBufSize]; memset(pBuffer, '\0', iBufSize); iRet = tidySaveString(tdoc, pBuffer, &uiBufLen); } else { throw (-1); } if (iRet >= 0) { sDestPage = pBuffer; } else if (iRet == -ENOMEM) { //pBuffer 长度不够 //cerr << "TidyHtml pBuffer长度不够!" << endl; throw (-1); } else { throw (-1); } } catch(exception &err) { //cerr << "TidyHtml HtmlTidy修补页面失败! " << err.what() << endl; iReturn = -1; } catch(int iThrow) { if (iThrow < 0) { //cerr << "TidyHtml HtmlTidy修补页面失败!" << endl; } iReturn = iThrow; } catch(...) { //cerr << "TidyHtml HtmlTidy修补页面失败!" << endl; iReturn = -1; } tidyBufFree(&errbuf); tidyRelease(tdoc); if (pBuffer != NULL) { delete [] pBuffer; pBuffer = NULL; } return iReturn; }