Пример #1
0
QString tidyHtml(QString str, bool& ok) {
#ifdef NO_TIDY
  ok = true;
  return str;
#else
  QString res = str;
  ok = false;

  static bool isTidyWithIntBodyOnly = isTidyWithIntBodyOnlyCheck();
  
  TidyDoc tdoc = tidyCreate();
  TidyBuffer output;
  TidyBuffer errbuf;

  tidyBufInit(&output);
  tidyBufInit(&errbuf);

  bool configOk = 
    tidyOptSetBool(tdoc, TidyXhtmlOut, yes) && 
    tidyOptSetBool(tdoc, TidyForceOutput, yes) &&
    tidyOptSetBool(tdoc, TidyMark, no) &&
    (isTidyWithIntBodyOnly
     ? tidyOptSetInt(tdoc, TidyBodyOnly, 1)
     : tidyOptSetBool(tdoc, TidyBodyOnly, yes)) &&
    tidyOptSetInt(tdoc, TidyWrapLen, 0) &&
    tidyOptSetInt(tdoc, TidyDoctypeMode, TidyDoctypeOmit);
    
  if (configOk &&
      (tidySetCharEncoding(tdoc, "utf8") >= 0) &&
      (tidySetErrorBuffer(tdoc, &errbuf) >= 0) &&
      (tidyParseString(tdoc, str.toUtf8().data()) >= 0) &&
      (tidyCleanAndRepair(tdoc) >= 0) &&
      (tidyRunDiagnostics(tdoc) >= 0) &&
      (tidySaveBuffer(tdoc, &output) >= 0) &&
      (output.bp != 0 && output.size > 0)) {
    res = QString::fromUtf8((char*)output.bp, output.size);

    ok = true;
  }

#ifdef DEBUG_MARKUP
  if (errbuf.size > 0) {
    QString errStr =  QString::fromUtf8((char*)errbuf.bp, errbuf.size);
    qDebug() << "\n[DEBUG] MARKUP, libtidy errors and warnings:\n" << errStr;
  }
#endif

  if (output.bp != 0)
    tidyBufFree(&output);
  if (errbuf.bp != 0)
    tidyBufFree(&errbuf);
  tidyRelease(tdoc);

  return res.trimmed();
#endif
}
Пример #2
0
void tidyhtml::parse(std::string x, std::string path)
{
        tidySetErrorBuffer(tdoc, NULL);
        tidyParseString(tdoc, x.c_str());
        tidyCleanAndRepair( tdoc );
        tidyRunDiagnostics( tdoc );
        tidySaveFile(tdoc, path.c_str());


}
Пример #3
0
/* {{{ proto boolean tidy_diagnose()
   Run configured diagnostics on parsed and repaired markup. */
static PHP_FUNCTION(tidy_diagnose)
{
	TIDY_FETCH_OBJECT;

	if (obj->ptdoc->initialized && tidyRunDiagnostics(obj->ptdoc->doc) >= 0) {
		tidy_doc_update_properties(obj);
		RETURN_TRUE;
	}

	RETURN_FALSE;
}
Пример #4
0
int main(int argc, char **argv )
{
  CURL *curl;
  char curl_errbuf[CURL_ERROR_SIZE];
  TidyDoc tdoc;
  TidyBuffer docbuf = {0};
  TidyBuffer tidy_errbuf = {0};
  int err;
  if ( argc == 2) {
    curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
    curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
    curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);

    tdoc = tidyCreate();
    tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
    tidyOptSetInt(tdoc, TidyWrapLen, 4096);
    tidySetErrorBuffer( tdoc, &tidy_errbuf );
    tidyBufInit(&docbuf);

    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
    err=curl_easy_perform(curl);
    if ( !err ) {
      err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
      if ( err >= 0 ) {
        err = tidyCleanAndRepair(tdoc); /* fix any problems */
        if ( err >= 0 ) {
          err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
          if ( err >= 0 ) {
            dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */
            fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
          }
        }
      }
    }
    else
      fprintf(stderr, "%s\n", curl_errbuf);

    /* clean-up */
    curl_easy_cleanup(curl);
    tidyBufFree(&docbuf);
    tidyBufFree(&tidy_errbuf);
    tidyRelease(tdoc);
    return(err);

  }
  else
    printf( "usage: %s <url>\n", argv[0] );

  return(0);
}
Пример #5
0
static html_valid_status_t
html_valid_run (html_valid_t * htv, SV * html,
		SV ** output_ptr, SV ** errors_ptr)
{
    const char * html_string;
    STRLEN html_length;
    SV * output;
    SV * errors;

    TidyBuffer tidy_output = {0};
    TidyBuffer tidy_errbuf = {0};

    /* First set these up sanely in case the stuff hits the fan. */

    * output_ptr = & PL_sv_undef;
    * errors_ptr = & PL_sv_undef;

    /* Work around bug where allocator sometimes does not get set. */

    CopyAllocator (htv->tdoc, & tidy_output);
    CopyAllocator (htv->tdoc, & tidy_errbuf);

    html_string = SvPV (html, html_length);
    CALL_TIDY (tidySetErrorBuffer (htv->tdoc, & tidy_errbuf));
    htv->n_mallocs++;
    CALL_TIDY (tidyParseString (htv->tdoc, html_string));
    CALL_TIDY (tidyCleanAndRepair (htv->tdoc));
    CALL_TIDY (tidyRunDiagnostics (htv->tdoc));
    CALL_TIDY (tidySaveBuffer (htv->tdoc, & tidy_output));
    htv->n_mallocs++;

    /* Copy the contents of the buffers into the Perl scalars. */

    output = newSVpv ((char *) tidy_output.bp, tidy_output.size);
    errors = newSVpv ((char *) tidy_errbuf.bp, tidy_errbuf.size);

    /* HTML Tidy randomly segfaults here due to "allocator" not being
       set in some cases, hence the above CopyAllocator fix. */

    tidyBufFree (& tidy_output);
    htv->n_mallocs--;
    tidyBufFree (& tidy_errbuf);
    htv->n_mallocs--;

    /* These are not our mallocs, they are Perl's mallocs, so we don't
       increase htv->n_mallocs for these. After we return them, we no
       longer take care of these. */
    * output_ptr = output;
    * errors_ptr = errors;
    return html_valid_ok;
}
void TidyNetworkReply::tidyUp() {
    QUrl redirect = reply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl();
    if (redirect.isValid()) {
        redirect.setScheme("tidy");
        setAttribute(QNetworkRequest::RedirectionTargetAttribute, QVariant(redirect));

        emit finished();
        reply->deleteLater();
        return;
    }

    int rc = -1;
    Bool ok;

    ok = tidyOptSetBool( tdoc, TidyXmlOut, yes );  // Convert to XHTML
    if (ok)
        ok = tidyOptSetBool(tdoc, TidyQuoteNbsp, no);
    //if (ok)
    //ok = tidyOptSetValue(tdoc, TidyBlockTags, "header,nav,article,time,section,footer");
    if ( ok )
        rc = tidySetErrorBuffer( tdoc, &errbuf );      // Capture diagnostics
    if ( rc >= 0 )
        rc = tidyParseString( tdoc, reply->readAll() );           // Parse the input
    if ( rc >= 0 )
        rc = tidyCleanAndRepair( tdoc );               // Tidy it up!
    if ( rc >= 0 )
        rc = tidyRunDiagnostics( tdoc );               // Kvetch
    if ( rc > 1 )                                    // If error, force output.
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
    if ( rc >= 0 )
        rc = tidySaveBuffer( tdoc, &output );          // Pretty Print

    if ( rc >= 0 ) {
        if ( rc > 0 ) {
            ;//printf( "\nDiagnostics:\n\n%s", errbuf.bp );
        }
    } else {
        ;//printf( "A severe error (%d) occurred.\n", rc );
    }

    open(ReadOnly);
    emit readyRead();
    emit finished();

    reply->deleteLater();
    //QTimer::singleShot(0, this, SIGNAL(readyRead()));
    //QTimer::singleShot(0, this, SIGNAL(finished()));
}
Пример #7
0
void HTMLTidy::run() throw( std::runtime_error ) {
	TidyBuffer outputBuffer = { 0 };
	TidyBuffer errorBuffer = { 0 };
	// try to create valid XHTML document for XML parser:
	int tidyResult = -1;
	if( tidyOptSetBool( handle, TidyXhtmlOut, yes ) ) {
		tidyResult = tidySetErrorBuffer( handle, &errorBuffer );
	}
	if( tidyResult >= 0 ) {
		tidyResult = tidyParseString( handle, document.c_str() );
	}
	if( tidyResult >= 0 ) {
		tidyResult = tidyCleanAndRepair( handle );
	}
	if( tidyResult >= 0 ) {
		tidyResult = tidyRunDiagnostics( handle );
	}
	if( tidyResult > 1 ) {
		if( !tidyOptSetBool( handle, TidyForceOutput, yes ) ) {
			tidyResult = -1;
		}
	}
	if( tidyResult >= 0 ) {
		tidyResult = tidySaveBuffer( handle, &outputBuffer );
	}
	if( tidyResult > 0 ) {
		std::clog << "*********************************" << std::endl;
		std::clog << "HTMLTidy: Diagnostics of libtidy:" << std::endl;
		std::clog << errorBuffer.bp;
		std::clog << "*********************************" << std::endl;
	}
	else if( tidyResult < 0 ) {
		std::stringstream sstrTidyResult;
		sstrTidyResult << tidyResult;
		throw std::runtime_error( "HTMLTidy: A severe error occured while tidying up the received document ("
		                          + sstrTidyResult.str()
		                          + ")."
		                        );
	}
	resultDocument.reserve( outputBuffer.size ); // avoid frequent (re-)allocations
	for( unsigned int i = 0; i < outputBuffer.size; i++ ) {
		resultDocument.insert( resultDocument.end(), static_cast< char >( *(outputBuffer.bp + i) ) );
	}
	tidyBufFree( &outputBuffer );
	tidyBufFree( &errorBuffer );
}
Пример #8
0
QString tidy(QString input)
// take html code and return it converted to xhtml code
{                                                                              
  // the following code is (c) Charles Reitzel and Dave Raggett, see the package tidy                                                                                                                                                             
  TidyBuffer output = {0};                                                                                               
  TidyBuffer errbuf = {0};                                                                                               
  QString result;                                                                                                        
  int rc = -1;                                                                                                           
  Bool ok;                                                                                                               

  TidyDoc tdoc = tidyCreate();                             // Initialize "document"
  ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );          // Convert to XHTML
  if ( ok ) rc = tidySetErrorBuffer( tdoc, &errbuf );      // Capture diagnostics
  tidySetCharEncoding( tdoc, "utf8" );
  if ( rc >= 0 ) rc = tidyParseString( tdoc, input.toUtf8().constData() );      // Parse the input    
  if ( rc >= 0 ) rc = tidyCleanAndRepair( tdoc );          // Tidy it up!        
  if ( rc >= 0 ) rc = tidyRunDiagnostics( tdoc );          // Kvetch             
  if ( rc > 1 )                                            // If error, force output.
    rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );                   
  if ( rc >= 0 ) rc = tidySaveBuffer( tdoc, &output );     // Pretty Print           
  if ( rc >= 0 )                                                                     
  {                                                                                                
    char* outputstring; // content of the outputfile                                 

    // find out length of outputstring
    int length=0; // length of outputstring
    byte* string=output.bp;                
    while (*string)                        
    {                                      
      string++;                                                
      length++;                                                
    }                                                          

    outputstring=(char*)malloc(length);        
    snprintf(outputstring,length,"%s",output.bp);
    result=QString::fromUtf8(outputstring,length);
  }                                                    
  else                                                 
    printf( "A severe error (\%d) occurred.\\n", rc ); 
  tidyBufFree( &output );                              
  tidyBufFree( &errbuf );                              
  tidyRelease( tdoc );
  result=result.replace("&Atilde;&para;","&ouml;");
  return result;                                       
}
Пример #9
0
bool CCFHtmlTidy::TidyMain(const char* pSourceIn, const char* pOptions, std::string &strOut, std::string &strErr)
{
	TidyBuffer output;
	TidyBuffer errbuf;
	int rc = -1;
	Bool ok = yes;

	TidyDoc tdoc = tidyCreate();                     // Initialize "document"
	tidyBufInit(&output);
	tidyBufInit(&errbuf);

	TidyOptionsSet(tidyDocToImpl(tdoc), pOptions);

	if (ok)
		rc = tidySetErrorBuffer(tdoc, &errbuf);      // Capture diagnostics
	if (rc >= 0)
		rc = tidyParseString(tdoc, pSourceIn);           // Parse the input
	if (rc >= 0)
		rc = tidyCleanAndRepair(tdoc);               // Tidy it up!
	if (rc >= 0)
		rc = tidyRunDiagnostics(tdoc);               // Kvetch
	//if ( rc > 1 )                                    // If error, force output.
	//	rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
	if (rc >= 0)
		rc = tidySaveBuffer(tdoc, &output);          // Pretty Print

	if (rc >= 0)
	{
		if (output.bp)
		{
			strOut = reinterpret_cast<char const*>(output.bp);
		}
	}

	strErr = reinterpret_cast<char const*>(errbuf.bp);
	std::string strEmpty = "No warnings or errors were found.\n\n";
	if (0 == strEmpty.compare(strErr))
	{
		strErr.clear();
	}
	tidyBufFree(&output);
	tidyBufFree(&errbuf);
	tidyRelease(tdoc);
	return true;
}
Пример #10
0
bool HTidyInterface::formatSource( const char* textIn, CString &strTidy, CString &strMsg )
{
	TidyBuffer output;
	TidyBuffer errbuf;	
	int rc = -1;
	Bool ok = yes;

	TidyDoc tdoc = tidyCreate();                     // Initialize "document"
	tidyBufInit(&output);
	tidyBufInit(&errbuf);

	InitTidyDefault(tdoc);
	SetTidyConfig(tdoc);

	if ( ok )
		rc = tidySetErrorBuffer(tdoc, &errbuf);      // Capture diagnostics
	if ( rc >= 0 )
		rc = tidyParseString(tdoc, textIn);           // Parse the input
	if ( rc >= 0 )
		rc = tidyCleanAndRepair(tdoc);               // Tidy it up!
	if ( rc >= 0 )
		rc = tidyRunDiagnostics(tdoc);               // Kvetch
	//if ( rc > 1 )                                    // If error, force output.
	//	rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
	if ( rc >= 0 )
		rc = tidySaveBuffer(tdoc, &output);          // Pretty Print

	if ( rc >= 0 )
	{		
		strTidy = reinterpret_cast< char const* >(output.bp);
	}

	strMsg = reinterpret_cast< char const* >(errbuf.bp);
	CString strEmpty = _T("No warnings or errors were found.\r\n\r\n");
	if (0 == strEmpty.Compare(strMsg))
	{
		strMsg.Empty();
	}
	tidyBufFree(&output);
	tidyBufFree(&errbuf);
	tidyRelease(tdoc);
	return true;
}
Пример #11
0
int main(int argc, char **argv )
{
    const char* input = "<title>Hello</title><p>World!";
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};
    int rc = -1;
    Bool ok;

    // Initialize "document"
    TidyDoc tdoc = tidyCreate();
    printf( "Tidying:\t%s\n", input );

    // Convert to XHTML
    ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );  
    if ( ok )
        rc = tidySetErrorBuffer( tdoc, &errbuf );    // Capture diagnostics
    if ( rc >= 0 )
        rc = tidyParseString( tdoc, input );         // Parse the input
    if ( rc >= 0 )
        rc = tidyCleanAndRepair( tdoc );             // Tidy it up!
    if ( rc >= 0 )
        rc = tidyRunDiagnostics( tdoc );             // Kvetch
    if ( rc > 1 )                                    // If error, force output.
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
    if ( rc >= 0 )
        rc = tidySaveBuffer( tdoc, &output );        // Pretty Print

    if ( rc >= 0 )
    {
    if ( rc > 0 )
        printf( "\nDiagnostics:\n\n%s", errbuf.bp );
    printf( "\nAnd here is the result:\n\n%s", output.bp );
    }
    else
        printf( "A severe error (%d) occurred.\n", rc );

    tidyBufFree( &output );
    tidyBufFree( &errbuf );
    tidyRelease( tdoc );
    return rc;
}
Пример #12
0
Файл: tdoc.c Проект: nuxlli/wax
// quick and dirty shortcut function.
int lua_tidy_easyClean ( lua_State *L )
{
    TidyBuffer output;
    TidyBuffer errbuf;
    int rc;
	pTidy t;
	const char * input;

    tidyBufInit(&output);
    tidyBufInit(&errbuf);
    
    rc = -1;
    
    t = toTidy(L,1);
    input = lua_tostring(L,2);
    
    rc = tidySetErrorBuffer( t->tdoc, &errbuf );
    if ( rc >= 0 )
        rc = tidyParseString( t->tdoc, input );
    if ( rc >= 0 )
        rc = tidyCleanAndRepair( t->tdoc );
    if ( rc >= 0 )
        rc = tidyRunDiagnostics( t->tdoc );
    if ( rc >= 0 )
        rc = tidySaveBuffer( t->tdoc, &output );    
    
    lua_pushlstring(L, (char*)output.bp,output.size);
    if ( rc != 0  )
        lua_pushlstring(L, (char*)errbuf.bp,errbuf.size);
    else
        lua_pushnil(L);
    
    lua_pushnumber(L, rc);
    
    tidyBufFree( &output );
    tidyBufFree( &errbuf );
    
    
    return 3;
}
Пример #13
0
void html_parse(const gchar* html, GSList** objs) {
	TidyDoc tdoc = tidyCreate();
	TidyBuffer tidy_errbuf = {0};
	int err = 0;
  
	tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ 
	tidyOptSetInt(tdoc, TidyWrapLen, 4096);
	tidySetErrorBuffer( tdoc, &tidy_errbuf );
    
	err = tidyParseString(tdoc, html); /* parse the input */ 
	
	if ( err >= 0 ) {
		err = tidyCleanAndRepair(tdoc); /* fix any problems */ 
		
		if ( err >= 0 ) {
			err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ 
			
			if ( err >= 0 ) {
				html_find_objects(tidyGetHtml(tdoc), objs); /* walk the tree */ 
			}
		}
	}
}
Пример #14
0
static PyObject*
elementtidy_fixup(PyObject* self, PyObject* args)
{
    int rc;
    TidyDoc doc;
    TidyBuffer out = {0};
    TidyBuffer err = {0};
    PyObject* pyout;
    PyObject* pyerr;

    char* text;
    char* encoding = NULL;
    if (!PyArg_ParseTuple(args, "s|s:fixup", &text, &encoding))
        return NULL;

    doc = tidyCreate();

    /* options for nice XHTML output */
    if (encoding)
        /* if an encoding is given, use it for both input and output */
        tidyOptSetValue(doc, TidyCharEncoding, encoding);
    else
        /* if no encoding is given, use default input and utf-8 output */
        tidyOptSetValue(doc, TidyOutCharEncoding, "utf8");
    tidyOptSetBool(doc, TidyForceOutput, yes);
    tidyOptSetInt(doc, TidyWrapLen, 0);
    tidyOptSetBool(doc, TidyQuiet, yes);
    tidyOptSetBool(doc, TidyXhtmlOut, yes);
    tidyOptSetBool(doc, TidyXmlDecl, yes);
    tidyOptSetInt(doc, TidyIndentContent, 0);
    tidyOptSetBool(doc, TidyNumEntities, yes);

    rc = tidySetErrorBuffer(doc, &err);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidySetErrorBuffer failed");
        goto error;
    }

    rc = tidyParseString(doc, text);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyParseString failed");
        goto error;
    }

    rc = tidyCleanAndRepair(doc);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyCleanAndRepair failed");
        goto error;
    }

    rc = tidyRunDiagnostics(doc);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed");
        goto error;
    }

    rc = tidySaveBuffer(doc, &out);
    if (rc < 0) {
        PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed");
        goto error;
    }


    pyout = PyString_FromString(out.bp ? out.bp : "");
    if (!pyout)
        goto error;
    pyerr = PyString_FromString(err.bp ? err.bp : "");
    if (!pyerr) {
        Py_DECREF(pyout);
        goto error;
    }

    tidyBufFree(&out);
    tidyBufFree(&err);

    tidyRelease(doc);

    return Py_BuildValue("NN", pyout, pyerr);

  error:
    tidyBufFree(&out);
    tidyBufFree(&err);

    tidyRelease(doc);

    return NULL;
}
Пример #15
0
bool nuiHTML::Load(nglIStream& rStream, nglTextEncoding OverrideContentsEncoding, const nglString& rSourceURL)
{
  if (!rSourceURL.IsEmpty())
    SetSourceURL(rSourceURL);
  
  int res = -1;
  nglTextEncoding encoding = eUTF8;
  TidyDoc tdoc = NULL;
  {
    HTMLStream strm(rStream);
    tdoc = tidyCreate();
    tidyOptSetBool(tdoc, TidyShowMarkup, no);
    tidyOptSetBool(tdoc, TidyShowWarnings, no);
    tidyOptSetInt(tdoc, TidyShowErrors, 0);
    tidyOptSetBool(tdoc, TidyQuiet, yes);
    tidySetCharEncoding(tdoc, "utf8");
    
    TidyInputSource source;
    tidyInitSource( &source, &strm, &HTMLStream::TidyGetByte, &HTMLStream::TidyUngetByte, &HTMLStream::TidyEOF);
    res = tidyParseSource(tdoc, &source);
    
    if ( res >= 0 )
      res = tidyCleanAndRepair(tdoc);               // Tidy it up!
    if ( res >= 0 )
      res = tidyRunDiagnostics(tdoc);               // Kvetch
  
    if (OverrideContentsEncoding == eEncodingUnknown)
    {
      nglString encoding_string(GetEncodingString(tidyGetRoot(tdoc)));
      
      //ascii, latin1, raw, utf8, iso2022, mac, win1252, utf16le, utf16be, utf16, big5 shiftjis
      encoding = nuiGetTextEncodingFromString(encoding_string);
    }
    else
    {
      encoding = OverrideContentsEncoding;
    }
  }
  
  char* pStr = NULL;

  if (encoding != eUTF8)
  {
    // Release the doc to create a new one
    tidyRelease(tdoc);
    
    nglOMemory omem;
    rStream.SetPos(0, eStreamFromStart);
    rStream.PipeTo(omem);
    nglString decoded;
    decoded.Import(omem.GetBufferData(), omem.GetSize(), encoding);
    pStr = decoded.Export(eUTF8);
    nglIMemory imem(pStr, strlen(pStr));
    
    HTMLStream strm(imem);
    tdoc = tidyCreate();
    tidySetCharEncoding(tdoc, "utf8");

    TidyInputSource source;
    tidyInitSource( &source, &strm, &HTMLStream::TidyGetByte, &HTMLStream::TidyUngetByte, &HTMLStream::TidyEOF);
    res = tidyParseSource(tdoc, &source);
    if ( res >= 0 )
      res = tidyCleanAndRepair(tdoc);               // Tidy it up!
    if ( res >= 0 )
      res = tidyRunDiagnostics(tdoc);               // Kvetch
  }    
    
  BuildTree(tdoc, tidyGetRoot(tdoc), eUTF8, mComputeStyle);
  
  tidyRelease(tdoc);
  
  if (pStr)
    free(pStr);
  
  return res < 2;
}
Пример #16
0
int CProxyParse::RunFromMem( wxString content )
{
	char *pBuffer;
	//http://www.51proxied.com/http_non_anonymous.html
	//wxString path = wxT("f:/work/windows/wxUrlRefresh/data/最新透明HTTP代理服务器.htm");
	//wxString path1 = wxT("f:/work/windows/wxUrlRefresh/data/result.xml");

	wxString data_path = wxGetCwd() + "/data/";
	wxString path1 = data_path + "_tmp.xml";

	if (!wxDirExists(data_path))
		wxMkdir(data_path);

	pBuffer = (char*)calloc(content.Length()+1, 1);
	wxStrncpy(pBuffer, content, content.Len()+1);


	wxLogMessage("Run Tidy!");
	TidyBuffer output;
	TidyBuffer errbuf;
	int rc = -1;
	Bool ok;
	TidyDoc tdoc = tidyCreate();                     // Initialize "document"
	tidyBufInit( &output );
	tidyBufInit( &errbuf );
	//printf( "Tidying:\t\%s\\n", input );
	tidySetCharEncoding(tdoc, "utf8");
	ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );  // Convert to XHTML
	if ( ok )
		rc = tidySetErrorBuffer( tdoc, &errbuf );      // Capture diagnostics
	if ( rc >= 0 )
		rc = tidyParseString( tdoc, pBuffer );           // Parse the input
	if ( rc >= 0 )
		rc = tidyCleanAndRepair( tdoc );               // Tidy it up!
	if ( rc >= 0 )
		rc = tidyRunDiagnostics( tdoc );               // Kvetch
	if ( rc > 1 )                                    // If error, force output.
		rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
	if ( rc >= 0 )
		rc = tidySaveBuffer( tdoc, &output );          // Pretty Print
	if ( rc >= 0 )
	{
#ifdef _DEBUG
		//if ( rc > 0 )
		//	WriteAllToFile("f:/work/windows/wxUrlRefresh/data/error.xml", (char*)errbuf.bp, errbuf.size);
		WriteAllToFile(path1, (char*)output.bp, output.size);
#endif

	}
	else
		wxLogError("tidyFail");

	tidyBufFree( &output );
	tidyBufFree( &errbuf );
	tidyRelease( tdoc );
	if (pBuffer) free(pBuffer);


	wxLogMessage("Fetch data!");
	// 解析数据
	TiXmlDocument doc(path1);
	if (doc.LoadFile()) 
	{
		// root
		CTiXmlProxyVistor vistor(&m_array);
		TiXmlElement *pRoot = doc.RootElement();
		pRoot->Accept(&vistor);
	}
	else
	{
		wxLogMessage("shit");
		return -2;
	}
	return 0;
}
Пример #17
0
	/*!
	* \fn static int TidyHtml(const char *pcSourcePage, string &sDestPage);
	* \brief  修补丢失、错误标签
	* \param  [in]待修补网页字符串
	* \param  [out]修补后的网页string
	* \return 结果码,==0修补正确,<0修补失败
	* \date   2011-06-01 
	* \author nanjunxiao
	*/
	int Pretreat::TidyHtml(const char *pcSourcePage, std::string &sDestPage)
	{
		int iReturn = 0;
		TidyBuffer errbuf = {0};
		TidyDoc tdoc;
		tmbstr pBuffer = NULL;

		try
		{
			if ( (pcSourcePage == NULL) || (strlen(pcSourcePage) ==0 ) )
			{
				//cerr << "TidyHtml 输入页面为空!" << endl;
				throw (-1);
			}

			int iRet = -1;
			Bool bOk;
			uint uiBufLen;
			int iBufSize;
			tdoc = tidyCreate();// Initialize "document"
			bOk = tidyOptSetBool(tdoc, TidyXhtmlOut, yes);// Convert to XHTML
			if (bOk)
			{
				iRet = tidySetErrorBuffer(tdoc, &errbuf); // Capture diagnostics
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				iRet = tidySetCharEncoding(tdoc,"utf8"); //Ensure dealing with gb2312 successfully
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				string htmlsrc = pcSourcePage;
				iRet = tidyParseString (tdoc, htmlsrc.c_str() ); // Parse the input
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				iRet = tidyCleanAndRepair(tdoc); //Tidy it up!
			}
			else
			{
				throw (-1);
			}
			
			if (iRet >= 0)
			{
				iRet = tidyRunDiagnostics(tdoc); //Kvetch
			}
			else
			{
				throw (-1);
			}

			if(iRet > 1) // If error, force output.
			{
				iRet = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? iRet : -1 );
			}
			else if (iRet < 0)
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				// Pretty Print
				iBufSize = 1024 * 1024 * 5;
				uiBufLen = iBufSize;
				pBuffer = new char [iBufSize];
				memset(pBuffer, '\0', iBufSize);
				iRet = tidySaveString(tdoc, pBuffer, &uiBufLen);
			}
			else
			{
				throw (-1);
			}

			if (iRet >= 0)
			{
				sDestPage = pBuffer;
			}
			else if (iRet == -ENOMEM)
			{
				//pBuffer 长度不够
				//cerr << "TidyHtml pBuffer长度不够!" << endl;
				throw (-1);
			}
			else
			{
				throw (-1);
			}
		}
		catch(exception &err)
		{
			//cerr << "TidyHtml HtmlTidy修补页面失败! " << err.what() << endl;
			iReturn = -1;
		}
		catch(int iThrow)
		{
			if (iThrow < 0)
			{
				//cerr << "TidyHtml HtmlTidy修补页面失败!" << endl;
			}
			iReturn = iThrow;
		}
		catch(...)
		{
			//cerr << "TidyHtml HtmlTidy修补页面失败!" << endl;
			iReturn = -1;
		}

		tidyBufFree(&errbuf);
		tidyRelease(tdoc);
		if (pBuffer != NULL)
		{
			delete [] pBuffer;
			pBuffer = NULL;
		}
		return iReturn;
	}
Пример #18
0
bool TidyReader::openFile (const char * szFilename)
{
    UT_DEBUGMSG(("using libtidy to parse HTML...\n"));

    m_tidy = tidyCreate ();
    if (m_tidy == 0) return false;

    if (tidyOptSetBool (m_tidy, TidyXhtmlOut, yes) == 0)
    {
        UT_DEBUGMSG(("tidyOptSetBool failed!\n"));
        closeFile ();
        return false;
    }
#ifndef DEBUG
    tidySetErrorBuffer (m_tidy, &m_errbuf);
#endif

    int parse_status;
    if (m_buffer && m_length)
    {
        UT_DEBUGMSG(("parse HTML in buffer...\n"));

        UT_Byte * buffer = const_cast<UT_Byte *>(m_buffer); // grr.

        TidyBuffer inbuf;

        tidyBufInit (&inbuf);
        tidyBufAttach (&inbuf, buffer, static_cast<unsigned int>(m_length));

        parse_status = tidyParseBuffer (m_tidy, &inbuf);

        tidyBufDetach (&inbuf);
    }
    else
    {
        UT_DEBUGMSG(("parse HTML in file: %s\n",szFilename));
        parse_status = tidyParseFile (m_tidy, szFilename);
    }
    if (parse_status < 0)
    {
        UT_DEBUGMSG(("tidyParseBuffer/File failed!\n"));
        closeFile ();
        return false;
    }

    parse_status = tidyCleanAndRepair (m_tidy);
    if (parse_status < 0)
    {
        UT_DEBUGMSG(("tidyCleanAndRepair failed!\n"));
        closeFile ();
        return false;
    }

    parse_status = tidyRunDiagnostics (m_tidy);
    if (parse_status < 0)
    {
        UT_DEBUGMSG(("tidyRunDiagnostics failed!\n"));
        closeFile ();
        return false;
    }

    if (parse_status > 1)
    {
        parse_status = (tidyOptSetBool (m_tidy, TidyForceOutput, yes) ? parse_status : -1);
    }
    if (parse_status < 0)
    {
        UT_DEBUGMSG(("tidyOptSetBool failed!\n"));
        closeFile ();
        return false;
    }

    parse_status = tidySaveBuffer (m_tidy, &m_outbuf);
    if (parse_status < 0)
    {
        UT_DEBUGMSG(("tidySaveBuffer failed!\n"));
        closeFile ();
        return false;
    }
    UT_DEBUGMSG(("tidy succeeded!\n"));
#ifdef DEBUG
    fputs ("================================================================\n", stderr);
    fputs ((const char *) m_outbuf.bp, stderr);
    fputs ("================================================================\n", stderr);
#endif
    m_outbuf.next = 0;

    return true;
}
Пример #19
0
int main( int argc, char** argv )
{
    ctmbstr prog = argv[0];
    ctmbstr cfgfil = NULL, errfil = NULL, htmlfil = NULL;
    TidyDoc tdoc = tidyCreate();
    int status = 0;

    uint contentErrors = 0;
    uint contentWarnings = 0;
    uint accessWarnings = 0;

    errout = stderr;  /* initialize to stderr */
    status = 0;
    
#ifdef TIDY_CONFIG_FILE
    if ( tidyFileExists( tdoc, TIDY_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_CONFIG_FILE, status);
    }
#endif /* TIDY_CONFIG_FILE */

    /* look for env var "HTML_TIDY" */
    /* then for ~/.tidyrc (on platforms defining $HOME) */

    if ( (cfgfil = getenv("HTML_TIDY")) != NULL )
    {
        status = tidyLoadConfig( tdoc, cfgfil );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", cfgfil, status);
    }
#ifdef TIDY_USER_CONFIG_FILE
    else if ( tidyFileExists( tdoc, TIDY_USER_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_USER_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_USER_CONFIG_FILE, status);
    }
#endif /* TIDY_USER_CONFIG_FILE */

    /* read command line */
    while ( argc > 0 )
    {
        if (argc > 1 && argv[1][0] == '-')
        {
            /* support -foo and --foo */
            ctmbstr arg = argv[1] + 1;

            if ( strcasecmp(arg, "xml") == 0)
                tidyOptSetBool( tdoc, TidyXmlTags, yes );

            else if ( strcasecmp(arg,   "asxml") == 0 ||
                      strcasecmp(arg, "asxhtml") == 0 )
            {
                tidyOptSetBool( tdoc, TidyXhtmlOut, yes );
            }
            else if ( strcasecmp(arg,   "ashtml") == 0 )
                tidyOptSetBool( tdoc, TidyHtmlOut, yes );

            else if ( strcasecmp(arg, "indent") == 0 )
            {
                tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                    tidyOptResetToDefault( tdoc, TidyIndentSpaces );
            }
            else if ( strcasecmp(arg, "omit") == 0 )
                tidyOptSetBool( tdoc, TidyHideEndTags, yes );

            else if ( strcasecmp(arg, "upper") == 0 )
                tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );

            else if ( strcasecmp(arg, "clean") == 0 )
                tidyOptSetBool( tdoc, TidyMakeClean, yes );

            else if ( strcasecmp(arg, "bare") == 0 )
                tidyOptSetBool( tdoc, TidyMakeBare, yes );

            else if ( strcasecmp(arg, "raw") == 0      ||
                      strcasecmp(arg, "ascii") == 0    ||
                      strcasecmp(arg, "latin0") == 0   ||
                      strcasecmp(arg, "latin1") == 0   ||
                      strcasecmp(arg, "utf8") == 0     ||
#ifndef NO_NATIVE_ISO2022_SUPPORT
                      strcasecmp(arg, "iso2022") == 0  ||
#endif
#if SUPPORT_UTF16_ENCODINGS
                      strcasecmp(arg, "utf16le") == 0  ||
                      strcasecmp(arg, "utf16be") == 0  ||
                      strcasecmp(arg, "utf16") == 0    ||
#endif
#if SUPPORT_ASIAN_ENCODINGS
                      strcasecmp(arg, "shiftjis") == 0 ||
                      strcasecmp(arg, "big5") == 0     ||
#endif
                      strcasecmp(arg, "mac") == 0      ||
                      strcasecmp(arg, "win1252") == 0  ||
                      strcasecmp(arg, "ibm858") == 0 )
            {
                tidySetCharEncoding( tdoc, arg );
            }
            else if ( strcasecmp(arg, "numeric") == 0 )
                tidyOptSetBool( tdoc, TidyNumEntities, yes );

            else if ( strcasecmp(arg, "modify") == 0 ||
                      strcasecmp(arg, "change") == 0 ||  /* obsolete */
                      strcasecmp(arg, "update") == 0 )   /* obsolete */
            {
                tidyOptSetBool( tdoc, TidyWriteBack, yes );
            }
            else if ( strcasecmp(arg, "errors") == 0 )
                tidyOptSetBool( tdoc, TidyShowMarkup, no );

            else if ( strcasecmp(arg, "quiet") == 0 )
                tidyOptSetBool( tdoc, TidyQuiet, yes );

            else if ( strcasecmp(arg, "help") == 0 ||
                      strcasecmp(arg,    "h") == 0 || *arg == '?' )
            {
                help( prog );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-help") == 0)
            {
                xml_help( );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "help-config") == 0 )
            {
                optionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-config") == 0 )
            {
                XMLoptionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "show-config") == 0 )
            {
                optionvalues( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "config") == 0 )
            {
                if ( argc >= 3 )
                {
                    ctmbstr post;

                    tidyLoadConfig( tdoc, argv[2] );

                    /* Set new error output stream if setting changed */
                    post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    --argc;
                    ++argv;
                }
            }

#if SUPPORT_ASIAN_ENCODINGS
            else if ( strcasecmp(arg, "language") == 0 ||
                      strcasecmp(arg,     "lang") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyLanguage, argv[2] );
                    --argc;
                    ++argv;
                }
            }
#endif

            else if ( strcasecmp(arg, "output") == 0 ||
                      strcasecmp(arg, "-output-file") == 0 ||
                      strcasecmp(arg, "o") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyOutFile, argv[2] );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "file") == 0 ||
                      strcasecmp(arg, "-file") == 0 ||
                      strcasecmp(arg,     "f") == 0 )
            {
                if ( argc >= 3 )
                {
                    errfil = argv[2];
                    errout = tidySetErrorFile( tdoc, errfil );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "wrap") == 0 ||
                      strcasecmp(arg, "-wrap") == 0 ||
                      strcasecmp(arg,     "w") == 0 )
            {
                if ( argc >= 3 )
                {
                    uint wraplen = 0;
                    int nfields = sscanf( argv[2], "%u", &wraplen );
                    tidyOptSetInt( tdoc, TidyWrapLen, wraplen );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
            else if ( strcasecmp(arg,  "version") == 0 ||
                      strcasecmp(arg, "-version") == 0 ||
                      strcasecmp(arg,        "v") == 0 )
            {
                version();
                tidyRelease( tdoc );
                return 0;  /* success */

            }
            else if ( strncmp(argv[1], "--", 2 ) == 0)
            {
                if ( tidyOptParseValue(tdoc, argv[1]+2, argv[2]) )
                {
                    /* Set new error output stream if setting changed */
                    ctmbstr post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    ++argv;
                    --argc;
                }
            }

#if SUPPORT_ACCESSIBILITY_CHECKS
            else if ( strcasecmp(arg, "access") == 0 )
            {
                if ( argc >= 3 )
                {
                    uint acclvl = 0;
                    int nfields = sscanf( argv[2], "%u", &acclvl );
                    tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, acclvl );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
#endif

            else
            {
                uint c;
                ctmbstr s = argv[1];

                while ( (c = *++s) != '\0' )
                {
                    switch ( c )
                    {
                    case 'i':
                        tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                        if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                            tidyOptResetToDefault( tdoc, TidyIndentSpaces );
                        break;

                    /* Usurp -o for output file.  Anyone hiding end tags?
                    case 'o':
                        tidyOptSetBool( tdoc, TidyHideEndTags, yes );
                        break;
                    */

                    case 'u':
                        tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );
                        break;

                    case 'c':
                        tidyOptSetBool( tdoc, TidyMakeClean, yes );
                        break;

                    case 'b':
                        tidyOptSetBool( tdoc, TidyMakeBare, yes );
                        break;

                    case 'n':
                        tidyOptSetBool( tdoc, TidyNumEntities, yes );
                        break;

                    case 'm':
                        tidyOptSetBool( tdoc, TidyWriteBack, yes );
                        break;

                    case 'e':
                        tidyOptSetBool( tdoc, TidyShowMarkup, no );
                        break;

                    case 'q':
                        tidyOptSetBool( tdoc, TidyQuiet, yes );
                        break;

                    default:
                        unknownOption( c );
                        break;
                    }
                }
            }

            --argc;
            ++argv;
            continue;
        }

        if ( argc > 1 )
        {
            htmlfil = argv[1];
            if ( tidyOptGetBool(tdoc, TidyEmacs) )
                tidyOptSetValue( tdoc, TidyEmacsFile, htmlfil );
            status = tidyParseFile( tdoc, htmlfil );
        }
        else
        {
            htmlfil = "stdin";
            status = tidyParseStdin( tdoc );
        }

        if ( status >= 0 )
            status = tidyCleanAndRepair( tdoc );

        if ( status >= 0 )
            status = tidyRunDiagnostics( tdoc );

        if ( status > 1 ) /* If errors, do we want to force output? */
            status = ( tidyOptGetBool(tdoc, TidyForceOutput) ? status : -1 );

        if ( status >= 0 && tidyOptGetBool(tdoc, TidyShowMarkup) )
        {
            if ( tidyOptGetBool(tdoc, TidyWriteBack) && argc > 1 )
                status = tidySaveFile( tdoc, htmlfil );
            else
            {
                ctmbstr outfil = tidyOptGetValue( tdoc, TidyOutFile );
                if ( outfil )
                    status = tidySaveFile( tdoc, outfil );
                else
                    status = tidySaveStdout( tdoc );
            }
        }

        contentErrors   += tidyErrorCount( tdoc );
        contentWarnings += tidyWarningCount( tdoc );
        accessWarnings  += tidyAccessWarningCount( tdoc );

        --argc;
        ++argv;

        if ( argc <= 1 )
            break;
    }

    if (!tidyOptGetBool(tdoc, TidyQuiet) &&
        errout == stderr && !contentErrors)
        fprintf(errout, "\n");

    if (contentErrors + contentWarnings > 0 && 
         !tidyOptGetBool(tdoc, TidyQuiet))
        tidyErrorSummary(tdoc);

    if (!tidyOptGetBool(tdoc, TidyQuiet))
        tidyGeneralInfo(tdoc);

    /* called to free hash tables etc. */
    tidyRelease( tdoc );

    /* return status can be used by scripts */
    if ( contentErrors > 0 )
        return 2;

    if ( contentWarnings > 0 )
        return 1;

    /* 0 signifies all is ok */
    return 0;
}
Пример #20
0
Файл: tdoc.c Проект: nuxlli/wax
// returns warnings.
int lua_tidy_runDiagnostics (lua_State *L)
{
    pTidy t = toTidy(L,1);
    lua_pushnumber(L, tidyRunDiagnostics (t->tdoc));
    return 1;    
}