int main(int argc, char *argv[]) { CURL *curl; int counter = 0; if (argc < 2) return 1; curl = curl_easy_init(); TidyDoc tdoc = tidyCreate(); TidyBuffer output = {0}; tidyOptSetBool(tdoc, TidyXmlOut, yes); tidyOptSetBool(tdoc, TidyShowWarnings, no); tidyOptSetInt(tdoc, TidyWrapLen, 0); for(int i=0; i < 20; i++) { // tidyBufFree(&output); tidyBufClear(&output); // tidyParseFile(tdoc, argv[1]); tidyParseString(tdoc, getpage(curl,i).c_str()); tidySaveBuffer(tdoc, &output); // tidySaveFile(tdoc, "tidy_test.xml"); // doc.LoadFile(argv[1]); // doc.LoadFile("tidy_test.xml"); parseTidyBuf(output, counter); } curl_easy_cleanup(curl); return 0; }
/*
 * Copy the raw lexer text of a value-bearing node into buf.
 *
 * Only text, CDATA, comment, processing-instruction, section, ASP, JSTE and
 * PHP nodes carry a value.  For those, buf is cleared and then filled with
 * the bytes of the document's lexer buffer between node->start and
 * node->end.
 *
 * Returns yes when the value was copied; no when any argument is NULL or
 * the node type has no value (buf is left untouched in that case).
 */
Bool TIDY_CALL tidyNodeGetValue( TidyDoc tdoc, TidyNode tnod, TidyBuffer* buf )
{
    TidyDocImpl *impl = tidyDocToImpl( tdoc );
    Node *nimpl = tidyNodeToImpl( tnod );
    Bool hasValue;

    if ( impl == NULL || nimpl == NULL || buf == NULL )
        return no;

    /* First decide whether this kind of node has a value at all. */
    switch ( nimpl->type )
    {
    case TextNode:
    case CDATATag:
    case CommentTag:
    case ProcInsTag:
    case SectionTag:
    case AspTag:
    case JsteTag:
    case PhpTag:
        hasValue = yes;
        break;

    default:
        hasValue = no;
        break;
    }

    if ( hasValue == no )
        return no;

    /* Replace whatever buf held with the node's slice of the lexer buffer. */
    tidyBufClear( buf );
    tidyBufAppend( buf, impl->lexer->lexbuf + nimpl->start,
                   nimpl->end - nimpl->start );
    return yes;
}
/*
 * Build an htmlTag from one tidy node.
 *
 * Text nodes are given the name "Text"; start, end and start-end element
 * nodes use the name reported by tidy; every other node type is ignored.
 * When opentag is false the new tag is only marked as a closing tag.
 * Otherwise the tag receives, in order: the source line (script tags),
 * the text value (text tags), a 0-terminated copy of the attribute names
 * and values, and - for tags flagged TAG_INNERHTML - a size-capped copy of
 * tidy's rendering of the node.
 *
 * level is accepted for signature compatibility and is not used here.
 */
static void convertNode(TidyNode node, int level, bool opentag)
{
	struct htmlTag *tag;
	ctmbstr tagname;
	TidyAttr attr;
	int count;		/* how many attributes the node has */
	int k;

	switch (tidyNodeGetType(node)) {
	case TidyNode_Start:
	case TidyNode_End:
	case TidyNode_StartEnd:
		tagname = tidyNodeGetName(node);
		break;
	case TidyNode_Text:
		tagname = "Text";
		break;
	default:
		return;
	}

	tag = newTag((char *)tagname);
	if (!tag)
		return;

	/* A closing tag carries nothing beyond the slash marker. */
	if (!opentag) {
		tag->slash = true;
		return;
	}

	/* Scripts remember their line number for later error messages. */
	if (tag->action == TAGACT_SCRIPT)
		tag->js_ln = tidyNodeLine(node);

	/* Text tags keep a private copy of the node's value. */
	if (tag->action == TAGACT_TEXT) {
		TidyBuffer textbuf = { 0 };
		tidyBufClear(&textbuf);
		tidyNodeGetValue(tdoc, node, &textbuf);
		if (textbuf.size != 0) {
			tag->textval = cloneString(textbuf.bp);
			tidyBufFree(&textbuf);
		}
	}

	/* First pass: count the attributes so both arrays can be sized. */
	count = 0;
	for (attr = tidyAttrFirst(node); attr != NULL;
	     attr = tidyAttrNext(attr))
		++count;

	tag->attributes = allocMem(sizeof(char *) * (count + 1));
	tag->atvals = allocMem(sizeof(char *) * (count + 1));

	/* Second pass: clone each name/value pair into parallel slots. */
	k = 0;
	for (attr = tidyAttrFirst(node); attr != NULL;
	     attr = tidyAttrNext(attr)) {
		tag->attributes[k] = cloneString(tidyAttrName(attr));
		tag->atvals[k] = cloneString(tidyAttrValue(attr));
		++k;
	}
	tag->attributes[k] = 0;
	tag->atvals[k] = 0;

	/* innerHTML is kept only for tags whose info bits request it. */
	if (tag->info->bits & TAG_INNERHTML) {
		TidyBuffer htmlbuf = { 0 };
		tag->innerHTML = emptyString;
		tidyBufClear(&htmlbuf);
		tidyNodeGetText(tdoc, node, &htmlbuf);
		if (htmlbuf.size != 0) {
			/* This is tidy's sanitized rendering, not the original
			 * html.  Anything above 4096 bytes is dropped so that
			 * memory use cannot, in theory, grow with the square
			 * of the document size. */
			if (htmlbuf.size <= 4096)
				tag->innerHTML = cloneString(htmlbuf.bp);
			/* Runs on the empty placeholder too when over the cap. */
			tagStrip(tag->innerHTML);
			tidyBufFree(&htmlbuf);
		}
	}
}				/* convertNode */
/*
 * Debugging aid (intended for debug level >= 5): dump one tidy node to
 * stdout.
 *
 * For a closing visit (opentag false) only "}" is printed.  For an opening
 * visit the node's level and name are printed, followed by the text value
 * when the name is "Text", and then one "@name = value" line per attribute.
 * Non-element node types get a fixed descriptive name; everything else uses
 * the name reported by tidy, which must not be NULL.
 */
static void printNode(TidyNode node, int level, bool opentag)
{
	ctmbstr name;
	TidyAttr tattr;

	if (!opentag) {
		puts("}");
		return;
	}

	switch (tidyNodeGetType(node)) {
	case TidyNode_Root:
		name = "Root";
		break;
	case TidyNode_DocType:
		name = "DOCTYPE";
		break;
	case TidyNode_Comment:
		name = "Comment";
		break;
	case TidyNode_ProcIns:
		name = "Processing Instruction";
		break;
	case TidyNode_Text:
		name = "Text";
		break;
	case TidyNode_CDATA:
		name = "CDATA";
		break;
	case TidyNode_Section:
		name = "XML Section";
		break;
	case TidyNode_Asp:
		name = "ASP";
		break;
	case TidyNode_Jste:
		name = "JSTE";
		break;
	case TidyNode_Php:
		name = "PHP";
		break;
	case TidyNode_XmlDecl:
		name = "XML Declaration";
		break;
	case TidyNode_Start:
	case TidyNode_End:
	case TidyNode_StartEnd:
	default:
		name = tidyNodeGetName(node);
		break;
	}

	assert(name != NULL);
	printf("Node(%d): %s {\n", level, ((char *)name));

	if (stringEqual(((char *)name), "Text")) {
		TidyBuffer tnv = { 0 };	/* text-node value */
		tidyBufClear(&tnv);
		tidyNodeGetValue(tdoc, node, &tnv);
		/* The buffer pointer stays NULL when no value was stored;
		 * handing NULL to %s is undefined, so print an empty string. */
		printf("Text: %s\n", tnv.bp ? (char *)tnv.bp : "");
		if (tnv.size)
			tidyBufFree(&tnv);
	}

	/* Walk the attribute list and print each name/value pair. */
	tattr = tidyAttrFirst(node);
	while (tattr != NULL) {
		printf("@%s = %s\n", tidyAttrName(tattr), tidyAttrValue(tattr));
		tattr = tidyAttrNext(tattr);
	}
}				/* printNode */