Пример #1
0
// -------------------------------------------------------------
void Webpage::tidy_me()
{
	try {
		TidyDoc _tdoc = tidyCreate();
		//tidyOptSetBool(_tdoc, tidyOptGetIdForName("show-body-only"), (Bool)1);
		tidyOptSetBool(_tdoc, tidyOptGetIdForName("output-xhtml"), (Bool)1);
		tidyOptSetBool(_tdoc, tidyOptGetIdForName("quote-nbsp"), (Bool)0);
		tidyOptSetBool(_tdoc, tidyOptGetIdForName("show-warnings"), (Bool)0);
		tidyOptSetValue(_tdoc, tidyOptGetIdForName("char-encoding"), "utf8");
		//tidyOptSetBool(_tdoc, tidyOptGetIdForName("ascii-chars"), (Bool)1);
		//tidyOptSetBool(_tdoc, tidyOptGetIdForName("markup"), (Bool)1);
		//tidyOptSetValue(_tdoc, tidyOptGetIdForName("indent"), "yes");
		//tidyOptSetValue(_tdoc, tidyOptGetIdForName("newline"), "\n");
		tidyOptSetInt(_tdoc, tidyOptGetIdForName("wrap"), 5000);
		tidyParseString( _tdoc, contents.c_str() );
	
		/*
		// tidySaveBuffer doesn't seem to work with the makefile for some reason.
		TidyBuffer output = {0};
		tidySaveBuffer(_tdoc, &output);
		cout << "3. TidyBuffer size: " << output.size << endl;
		contents = string((char*)output.bp, (size_t)output.size);
		 */
		
		// tidySaveString is a tricky beast.
		tmbstr buffer = NULL;
		uint buflen = 0;
		int status;
		do {
			status = tidySaveString( _tdoc, buffer, &buflen );
			if (status == -ENOMEM) {
				if(buffer) 
					free(buffer);
				buffer = (tmbstr)malloc(buflen + 1);
			}
		} while (status == -ENOMEM);
		contents = (char*)buffer;

	} catch (exception& e) {
		throw e.what();
	}
}
Пример #2
0
	/*!
	* \fn static int TidyHtml(const char *pcSourcePage, string &sDestPage);
	* \brief  修补丢失、错误标签
	* \param  [in]待修补网页字符串
	* \param  [out]修补后的网页string
	* \return 结果码,==0修补正确,<0修补失败
	* \date   2011-06-01 
	* \author nanjunxiao
	*/
	int Pretreat::TidyHtml(const char *pcSourcePage, std::string &sDestPage)
	{
		int iReturn = 0;
		TidyBuffer errbuf = {0};
		TidyDoc tdoc;
		tmbstr pBuffer = NULL;

		try
		{
			if ( (pcSourcePage == NULL) || (strlen(pcSourcePage) ==0 ) )
			{
				//cerr << "TidyHtml 输入页面为空!" << endl;
				throw (-1);
			}

			int iRet = -1;
			Bool bOk;
			uint uiBufLen;
			int iBufSize;
			tdoc = tidyCreate();// Initialize "document"
			bOk = tidyOptSetBool(tdoc, TidyXhtmlOut, yes);// Convert to XHTML
			if (bOk)
			{
				iRet = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				iRet = tidySetCharEncoding(tdoc,"utf8"); //Ensure dealing with gb2312 successfully
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				string htmlsrc = pcSourcePage;
				iRet = tidyParseString (tdoc, htmlsrc.c_str() ); // Parse the input
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				iRet = tidyCleanAndRepair(tdoc); //Tidy it up!
			}
			else
			{
				throw (-1);
			}
			
			if (iRet >= 0)
			{
				iRet = tidyRunDiagnostics(tdoc); //Kvetch
			}
			else
			{
				throw (-1);
			}

			if(iRet > 1) // If error, force output.
			{
				iRet = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? iRet : -1 );
			}
			else if (iRet < 0)
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				// Pretty Print
				iBufSize = 1024 * 1024 * 5;
				uiBufLen = iBufSize;
				pBuffer = new char [iBufSize];
				memset(pBuffer, '\0', iBufSize);
				iRet = tidySaveString(tdoc, pBuffer, &uiBufLen);
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				sDestPage = pBuffer;
			}
			else if (iRet == -ENOMEM)
			{
				//pBuffer 长度不够
				//cerr << "TidyHtml pBuffer长度不够!" << endl;
				throw (-1);
			}
			else
			{
				throw (-1);
			}
		}
		catch(exception &err)
		{
			//cerr << "TidyHtml HtmlTidy修补页面失败! " << err.what() << endl;
			iReturn = -1;
		}
		catch(int iThrow)
		{
			if (iThrow < 0)
			{
				//cerr << "TidyHtml HtmlTidy修补页面失败!" << endl;
			}
			iReturn = iThrow;
		}
		catch(...)
		{
			//cerr << "TidyHtml HtmlTidy修补页面失败!" << endl;
			iReturn = -1;
		}

		tidyBufFree(&errbuf);
		tidyRelease(tdoc);
		if (pBuffer != NULL)
		{
			delete [] pBuffer;
			pBuffer = NULL;
		}
		return iReturn;
	}