void parseHTMLtoGetURL( const char* fileName ) { int i = 0; xmlDocPtr doc; xmlChar* xpath = (xmlChar *)"//*/a"; xmlXPathObjectPtr result; xmlChar* keyword; doc = htmlParseFile( fileName, "UTF-8" ); if( doc == NULL ){ fprintf( stderr, "Document not parsed successfully.\n" ); return; } result = parsebyXPath( doc, xpath ); if( result == NULL ){ fprintf( stderr, "Not Found XPath:%s\n", (const char *)xpath); return; } for( i = 0; i < result->nodesetval->nodeNr; i++ ){ keyword = xmlNodeListGetString(doc, result->nodesetval->nodeTab[i]->xmlChildrenNode, 1); printf("%s:", keyword ); xmlFree(keyword); keyword = xmlGetProp( result->nodesetval->nodeTab[i], (const xmlChar *)"href" ); printf("%s\n", keyword ); xmlFree(keyword); } xmlXPathFreeObject(result); return; }
/**
 * Populates a tree from a URL file.
 *
 * The file is parsed as HTML (iso-8859-1) and the <html>, <body> and <ul>
 * elements are looked up in turn; the <ul> element is then handed to
 * tree_url_load_directory() together with the tree's root node.
 *
 * \param filename       name of file to read
 * \param tree           tree which data will be read into
 * \param callback       node callback forwarded to tree_url_load_directory()
 * \param callback_data  opaque pointer forwarded alongside \a callback
 * \return true if the file was loaded, false on failure
 */
bool tree_urlfile_load(const char *filename, struct tree *tree,
		tree_node_user_callback callback, void *callback_data)
{
	xmlDoc *document;
	xmlNode *html_elem, *body_elem, *list_elem;
	struct node *tree_root;
	FILE *probe = NULL;

	if (filename == NULL)
		return false;

	/* Make sure the file can be opened for reading before parsing it. */
	probe = fopen(filename, "r");
	if (probe == NULL)
		return false;
	fclose(probe);

	document = htmlParseFile(filename, "iso-8859-1");
	if (document == NULL) {
		warn_user("TreeLoadError", messages_get("ParsingFail"));
		return false;
	}

	html_elem = tree_url_find_xml_element((xmlNode *) document, "html");
	body_elem = tree_url_find_xml_element(html_elem, "body");
	list_elem = tree_url_find_xml_element(body_elem, "ul");
	if (list_elem == NULL) {
		xmlFreeDoc(document);
		warn_user("TreeLoadError",
				"(<html>...<body>...<ul> not found.)");
		return false;
	}

	tree_root = tree_get_root(tree);
	tree_url_load_directory(list_elem, tree, tree_root, callback,
			callback_data);
	tree_set_node_expanded(tree, tree_root, true, false, false);

	xmlFreeDoc(document);
	return true;
}
void htmlp_get_link(spider_t *sp, const char *fname, const char *encoding) { htmlDocPtr docp; xmlXPathContextPtr context; int i=0, j=0; xmlXPathObjectPtr result; xmlNodeSetPtr nodeset; struct _xmlAttr *attrs; char *url; char complete_url[1024] = {0}; url_queue_t *uqueue = sp->urlq; char *pos = NULL; char *cur_chr = NULL; char *url_tmp_buf = NULL; /* printf("*******************************\n"); printf("%s\n", sp->cur_url); printf("*******************************\n"); */ docp = htmlParseFile(fname, encoding); context = xmlXPathNewContext(docp); result = xmlXPathEvalExpression("//a", context); xmlXPathFreeContext(context); if(!xmlXPathNodeSetIsEmpty(result->nodesetval)) { nodeset = result->nodesetval; for(i=0; i < nodeset->nodeNr; i++) { attrs = nodeset->nodeTab[i]->properties; while(attrs) { if (strncasecmp(attrs->name, "href", 4) == 0) { url = attrs->children->content; url_tmp_buf = calloc(1, strlen(url) + 1); struct struct_parts uobj; memcpy(url_tmp_buf, url, strlen(url)); printf("===================\n"); printf("url:%s\n", url); urlp_parse(url_tmp_buf, &uobj); printf("access: %s\n", uobj.access); printf("host:%s\n", uobj.host); printf("absolute:%s\n", uobj.absolute); printf("relative:%s\n", uobj.relative); printf("search:%s\n", uobj.search); printf("anchor:%s\n", uobj.anchor); printf("===================\n"); if (uobj.access != NULL) { if (strcasecmp(uobj.access, "javascript") != 0) { if (uobj.host != NULL) { if (memcmp(uobj.host, (sp->root_url_obj).host, strlen(uobj.host)) == 0) { url_queue_uniq_add(uqueue, url); } } else { url_queue_uniq_add(uqueue, url); } } } else { if (uobj.access == NULL) uobj.access = (sp->root_url_obj).access; if (uobj.host == NULL) uobj.host = (sp->root_url_obj).host; if (uobj.relative || uobj.absolute) { memset(complete_url, 0, 1024); if (uobj.relative) { pos = strrchr(sp->cur_url, '/'); if (*(pos-1) == '/' && *(pos-2) == ':') { snprintf(complete_url, 1023, "%s://%s", uobj.access, uobj.host); complete_url[strlen(complete_url)] = '/'; 
strncat(complete_url, uobj.relative, 1023); } else { for (j=0, cur_chr = sp->cur_url; cur_chr < pos + 1; cur_chr++, j++) { complete_url[j] = *cur_chr; } strncat(complete_url, uobj.relative, 1023); } } if (uobj.absolute) { snprintf(complete_url, 1023, "%s://%s", uobj.access, uobj.host); complete_url[strlen(complete_url)] = '/'; strncat(complete_url, uobj.absolute, 1023); } } else { memset(complete_url, 0, 1024); snprintf(complete_url, 1023, "%s://%s", uobj.access, uobj.host); complete_url[strlen(complete_url)] = '/'; } if (uobj.search) { complete_url[strlen(complete_url)] = '?'; strncat(complete_url, uobj.search, 1023); } if (uobj.anchor) { complete_url[strlen(complete_url)] = '#'; strncat(complete_url, uobj.anchor, 1023); } printf("******preurl: %s\n", complete_url); urlp_strip_dot(complete_url); printf("******passurl: %s\n", complete_url); url_queue_uniq_add(uqueue, complete_url); } free(url_tmp_buf); break; } attrs = attrs->next; } } } xmlXPathFreeObject (result); xmlCleanupParser(); }
int main(int argc, char **argv) { int i; xsltStylesheetPtr cur = NULL; xmlDocPtr doc, style; if (argc <= 1) { usage(argv[0]); return (1); } xmlInitMemory(); LIBXML_TEST_VERSION defaultLoader = xmlGetExternalEntityLoader(); xmlLineNumbersDefault(1); if (novalid == 0) /* TODO XML_DETECT_IDS | XML_COMPLETE_ATTRS */ xmlLoadExtDtdDefaultValue = 6; else xmlLoadExtDtdDefaultValue = 0; for (i = 1; i < argc; i++) { if (!strcmp(argv[i], "-")) break; if (argv[i][0] != '-') continue; #ifdef LIBXML_DEBUG_ENABLED if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug"))) { debug++; } else #endif if ((!strcmp(argv[i], "-v")) || (!strcmp(argv[i], "-verbose")) || (!strcmp(argv[i], "--verbose"))) { xsltSetGenericDebugFunc(stderr, NULL); } else if ((!strcmp(argv[i], "-o")) || (!strcmp(argv[i], "-output")) || (!strcmp(argv[i], "--output"))) { i++; output = argv[i++]; } else if ((!strcmp(argv[i], "-V")) || (!strcmp(argv[i], "-version")) || (!strcmp(argv[i], "--version"))) { printf("Using libxml %s, libxslt %s and libexslt %s\n", xmlParserVersion, xsltEngineVersion, exsltLibraryVersion); printf ("xsltproc was compiled against libxml %d, libxslt %d and libexslt %d\n", LIBXML_VERSION, LIBXSLT_VERSION, LIBEXSLT_VERSION); printf("libxslt %d was compiled against libxml %d\n", xsltLibxsltVersion, xsltLibxmlVersion); printf("libexslt %d was compiled against libxml %d\n", exsltLibexsltVersion, exsltLibxmlVersion); } else if ((!strcmp(argv[i], "-repeat")) || (!strcmp(argv[i], "--repeat"))) { if (repeat == 0) repeat = 20; else repeat = 100; } else if ((!strcmp(argv[i], "-novalid")) || (!strcmp(argv[i], "--novalid"))) { novalid++; } else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout"))) { noout++; #ifdef LIBXML_DOCB_ENABLED } else if ((!strcmp(argv[i], "-docbook")) || (!strcmp(argv[i], "--docbook"))) { docbook++; #endif #ifdef LIBXML_HTML_ENABLED } else if ((!strcmp(argv[i], "-html")) || (!strcmp(argv[i], "--html"))) { html++; #endif } else if ((!strcmp(argv[i], 
"-timing")) || (!strcmp(argv[i], "--timing"))) { timing++; } else if ((!strcmp(argv[i], "-profile")) || (!strcmp(argv[i], "--profile"))) { profile++; } else if ((!strcmp(argv[i], "-norman")) || (!strcmp(argv[i], "--norman"))) { profile++; } else if ((!strcmp(argv[i], "-warnnet")) || (!strcmp(argv[i], "--warnnet"))) { xmlSetExternalEntityLoader(xsltNoNetExternalEntityLoader); } else if ((!strcmp(argv[i], "-nonet")) || (!strcmp(argv[i], "--nonet"))) { xmlSetExternalEntityLoader(xsltNoNetExternalEntityLoader); nonet = 1; #ifdef LIBXML_CATALOG_ENABLED } else if ((!strcmp(argv[i], "-catalogs")) || (!strcmp(argv[i], "--catalogs"))) { const char *catalogs; catalogs = getenv("SGML_CATALOG_FILES"); if (catalogs == NULL) { fprintf(stderr, "Variable $SGML_CATALOG_FILES not set\n"); } else { xmlLoadCatalogs(catalogs); } #endif #ifdef LIBXML_XINCLUDE_ENABLED } else if ((!strcmp(argv[i], "-xinclude")) || (!strcmp(argv[i], "--xinclude"))) { xinclude++; xsltSetXIncludeDefault(1); #endif } else if ((!strcmp(argv[i], "-param")) || (!strcmp(argv[i], "--param"))) { i++; params[nbparams++] = argv[i++]; params[nbparams++] = argv[i]; if (nbparams >= 16) { fprintf(stderr, "too many params\n"); return (1); } } else if ((!strcmp(argv[i], "-maxdepth")) || (!strcmp(argv[i], "--maxdepth"))) { int value; i++; if (sscanf(argv[i], "%d", &value) == 1) { if (value > 0) xsltMaxDepth = value; } } else if (!strcmp(argv[i], "--r")) { startRAutomatically = 1; RstartupScript = strchr(argv[i],'='); continue; } else { fprintf(stderr, "Unknown option %s\n", argv[i]); usage(argv[0]); return (1); } } params[nbparams] = NULL; /* * Replace entities with their content. 
*/ xmlSubstituteEntitiesDefault(1); /* * Register the EXSLT extensions */ exsltRegisterAll(); registerRModule(0); if(startRAutomatically) { extern int RXSLT_internalSource(const char *fileName); int rargs = 1; const char *rargv[] = { "Sxsltproc" }; Rf_initEmbeddedR(rargs, rargv); loadXSLPackage(); if(RstartupScript && RstartupScript[0]) RXSLT_internalSource(RstartupScript); } for (i = 1; i < argc; i++) { if ((!strcmp(argv[i], "-maxdepth")) || (!strcmp(argv[i], "--maxdepth"))) { i++; continue; } else if ((!strcmp(argv[i], "-o")) || (!strcmp(argv[i], "-output")) || (!strcmp(argv[i], "--output"))) { i++; continue; } if ((!strcmp(argv[i], "-param")) || (!strcmp(argv[i], "--param"))) { i += 2; continue; } else if(!strcmp(argv[i], "--r")) { continue; } if ((argv[i][0] != '-') || (strcmp(argv[i], "-") == 0)) { if (timing) gettimeofday(&begin, NULL); style = xmlParseFile((const char *) argv[i]); if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "Parsing stylesheet %s took %ld ms\n", argv[i], msec); } if (style == NULL) { fprintf(stderr, "cannot parse %s\n", argv[i]); cur = NULL; } else { cur = xsltLoadStylesheetPI(style); if (cur != NULL) { /* it is an embedded stylesheet */ xsltProcess(style, cur, argv[i]); xsltFreeStylesheet(cur); exit(0); } cur = xsltParseStylesheetDoc(style); if (cur != NULL) { if (cur->indent == 1) xmlIndentTreeOutput = 1; else xmlIndentTreeOutput = 0; i++; } } break; } } /* * disable CDATA from being built in the document tree */ xmlDefaultSAXHandlerInit(); xmlDefaultSAXHandler.cdataBlock = NULL; if ((cur != NULL) && (cur->errors == 0)) { for (; i < argc; i++) { doc = NULL; if (timing) gettimeofday(&begin, NULL); #ifdef LIBXML_HTML_ENABLED if (html) doc = htmlParseFile(argv[i], NULL); else #endif #ifdef LIBXML_DOCB_ENABLED if (docbook) doc = docbParseFile(argv[i], NULL); else #endif doc = xmlParseFile(argv[i]); if (doc == NULL) { 
fprintf(stderr, "unable to parse %s\n", argv[i]); continue; } if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "Parsing document %s took %ld ms\n", argv[i], msec); } xsltProcess(doc, cur, argv[i]); } xsltFreeStylesheet(cur); } #ifdef CAN_UNREGISTER_MODULES xsltUnregisterAllExtModules(); #endif xmlCleanupParser(); xmlMemoryDump(); return (0); }
static void xsltProcess(xmlDocPtr doc, xsltStylesheetPtr cur, const char *filename) { xmlDocPtr res; #ifdef LIBXML_XINCLUDE_ENABLED if (xinclude) { if (timing) gettimeofday(&begin, NULL); xmlXIncludeProcess(doc); if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "XInclude processing %s took %ld ms\n", filename, msec); } } #endif if (timing) gettimeofday(&begin, NULL); if (output == NULL) { if (repeat) { int j; for (j = 1; j < repeat; j++) { res = xsltApplyStylesheet(cur, doc, params); xmlFreeDoc(res); xmlFreeDoc(doc); #ifdef LIBXML_HTML_ENABLED if (html) doc = htmlParseFile(filename, NULL); else #endif #ifdef LIBXML_DOCB_ENABLED if (docbook) doc = docbParseFile(filename, NULL); else #endif doc = xmlParseFile(filename); } } if (profile) { res = xsltProfileStylesheet(cur, doc, params, stderr); } else { res = xsltApplyStylesheet(cur, doc, params); } if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; if (repeat) fprintf(stderr, "Applying stylesheet %d times took %ld ms\n", repeat, msec); else fprintf(stderr, "Applying stylesheet took %ld ms\n", msec); } xmlFreeDoc(doc); if (res == NULL) { fprintf(stderr, "no result for %s\n", filename); return; } if (noout) { xmlFreeDoc(res); return; } #ifdef LIBXML_DEBUG_ENABLED if (debug) xmlDebugDumpDocument(stdout, res); else { #endif if (cur->methodURI == NULL) { if (timing) gettimeofday(&begin, NULL); xsltSaveResultToFile(stdout, res, cur); if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "Saving result took %ld ms\n", msec); } } else { if (xmlStrEqual (cur->method, (const xmlChar *) "xhtml")) { fprintf(stderr, "non standard output xhtml\n"); if (timing) gettimeofday(&begin, NULL); xsltSaveResultToFile(stdout, res, 
cur); if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "Saving result took %ld ms\n", msec); } } else { fprintf(stderr, "Unsupported non standard output %s\n", cur->method); } } #ifdef LIBXML_DEBUG_ENABLED } #endif xmlFreeDoc(res); } else { xsltRunStylesheet(cur, doc, params, output, NULL, NULL); if (timing) { long msec; gettimeofday(&end, NULL); msec = end.tv_sec - begin.tv_sec; msec *= 1000; msec += (end.tv_usec - begin.tv_usec) / 1000; fprintf(stderr, "Running stylesheet and saving result took %ld ms\n", msec); } xmlFreeDoc(doc); } }
/*
 * Returns 1 when `method` equals "POST" ignoring ASCII case, 0 otherwise.
 * The comparison includes the terminating NUL, so it never reads past the
 * end of a shorter string.
 */
static int form2postIsPost(const char *method)
{
    static const char post[] = "POST";
    size_t k;

    for (k = 0; k < sizeof post; ++k) {
        if (toupper((unsigned char) method[k]) != post[k]) {
            return 0;
        }
    }
    return 1;
}

/*
 * Reads an HTML document from stdin and prints, for every <form>, one line
 * of the shape "METHOD action?name=value&name=value".
 *
 * METHOD is "POST" when the form's method attribute is "post" (any case)
 * and "GET" otherwise; inputs without a name attribute are skipped.
 *
 * stdin is spooled to a temporary file under /tmp/form2post/ because the
 * document is parsed with htmlParseFile(); the file is removed before exit.
 *
 * Returns 0 on success, 1 when the input cannot be spooled or parsed.
 */
int main(int argc, char** argv)
{
    htmlDocPtr doc = NULL;
    xmlNodePtr root = NULL, form = NULL, input = NULL;
    size_t inputIndex = 0, formIndex = 0, i = 0;
    char tmpFileName[31] = "/tmp/form2post/";
    const char *formAction, *formMethod, *inputName, *inputValue;
    int tmpChar; /* int, so that EOF is distinguishable from every byte value */
    int firstParam;
    int status = 1;
    FILE* tmpFile;
    struct stat st = {0};

    (void) argc;
    (void) argv;

    if(stat(tmpFileName, &st) == -1) {
        mkdir(tmpFileName, 0777);
    }

    // Generate random tmp file
    // NOTE(review): a name derived from rand() seeded with time() is
    // predictable; consider an atomically created temp file (e.g. mkstemp).
    srand(time(NULL));
    for(i = 15; i < 30; ++i) {
        tmpFileName[i] = rand() % 26 + 97;
    }

    // Write stdin to tmp file
    tmpFile = fopen(tmpFileName, "w+");
    if(tmpFile == NULL) {
        fprintf(stderr, "Failed to create temporary file\n");
        return 1;
    }
    while((tmpChar = getchar()) != EOF) {
        fputc(tmpChar, tmpFile);
    }
    if(fclose(tmpFile) != 0) {
        fprintf(stderr, "Failed to write temporary file\n");
        goto cleanup;
    }

    // Open HTML file in XML parser
    doc = htmlParseFile(tmpFileName, "UTF-8");
    if(doc == NULL) {
        fprintf(stderr, "Failed to parse file\n");
        goto cleanup;
    }

    // Get XML root element
    root = xmlDocGetRootElement(doc);
    if(root == NULL) {
        fprintf(stderr, "This file contains no valid root element\n");
        goto cleanup;
    }

    // Loop through all "form" tags
    while((form = getTag(root, "form", formIndex++)) != NULL) {
        formAction = getAttribute(form, "action");
        formMethod = getAttribute(form, "method");

        if(formMethod != NULL && form2postIsPost(formMethod)) {
            formMethod = "POST";
        } else {
            formMethod = "GET";
        }

        if(formAction == NULL) {
            formAction = "";
        }

        printf("%s %s", formMethod, formAction);

        // Loop through each input of the current form
        firstParam = 1;
        inputIndex = 0;
        input = getTag(form, "input", inputIndex++);
        while(input != NULL) {
            inputName = getAttribute(input, "name");
            inputValue = getAttribute(input, "value");

            if(inputName != NULL) {
                if(inputValue == NULL) {
                    inputValue = "";
                }

                // '?' introduces the first printed parameter, even when
                // earlier inputs were skipped for lacking a name.
                fputs(firstParam ? "?" : "&", stdout);
                firstParam = 0;
                printf("%s=%s", inputName, inputValue);
            }

            input = getTag(form, "input", inputIndex++);
        }

        printf("\n");
    }

    status = 0;

cleanup:
    if(doc != NULL) {
        xmlFreeDoc(doc);
    }
    remove(tmpFileName);

    return status;
}