示例#1
0
void parseHTMLtoGetURL( const char* fileName )
{
	int i = 0;
	xmlDocPtr doc;
	xmlChar* xpath = (xmlChar *)"//*/a";
	xmlXPathObjectPtr  result;
	xmlChar* keyword;
	
	doc = htmlParseFile( fileName, "UTF-8" );

	if( doc == NULL ){
		fprintf( stderr, "Document not parsed successfully.\n" );
		return;
	}

	result = parsebyXPath( doc, xpath );

	if( result == NULL ){
		fprintf( stderr, "Not Found XPath:%s\n", (const char *)xpath);
		return;
	}

	for( i = 0; i < result->nodesetval->nodeNr; i++ ){
		keyword = xmlNodeListGetString(doc, result->nodesetval->nodeTab[i]->xmlChildrenNode, 1);
		printf("%s:", keyword );
		xmlFree(keyword);
		keyword = xmlGetProp( result->nodesetval->nodeTab[i], (const xmlChar *)"href" );
		printf("%s\n", keyword );
		xmlFree(keyword);
	}

	xmlXPathFreeObject(result);

	return;
}
示例#2
0
/**
 * Fills a tree with the entries of an HTML url file.
 *
 * The file is parsed as iso-8859-1 HTML. The "html", "body" and "ul"
 * elements are looked up in turn with tree_url_find_xml_element() and the
 * resulting "ul" node is handed to tree_url_load_directory() together with
 * the root node of the tree, which is expanded afterwards.
 *
 * \param  filename       name of file to read
 * \param  tree           tree which the data will be read into
 * \param  callback       forwarded unchanged to tree_url_load_directory()
 * \param  callback_data  forwarded unchanged to tree_url_load_directory()
 * \return true if the file was loaded into the tree, false if filename is
 *         NULL, the file cannot be opened or parsed, or no "ul" element
 *         was found
 */
bool tree_urlfile_load(const char *filename, struct tree *tree,
		       tree_node_user_callback callback, void *callback_data)
{
	FILE *probe;
	xmlDoc *document;
	xmlNode *html_node;
	xmlNode *body_node;
	xmlNode *list_node;
	struct node *top;

	if (filename == NULL)
		return false;

	/* Only checks that the file can be opened for reading; the parser
	 * opens the file by name again below. */
	probe = fopen(filename, "r");
	if (probe == NULL)
		return false;
	fclose(probe);

	document = htmlParseFile(filename, "iso-8859-1");
	if (document == NULL) {
		warn_user("TreeLoadError", messages_get("ParsingFail"));
		return false;
	}

	html_node = tree_url_find_xml_element((xmlNode *) document, "html");
	body_node = tree_url_find_xml_element(html_node, "body");
	list_node = tree_url_find_xml_element(body_node, "ul");

	if (list_node != NULL) {
		top = tree_get_root(tree);
		tree_url_load_directory(list_node, tree, top,
					callback, callback_data);
		tree_set_node_expanded(tree, top, true, false, false);

		xmlFreeDoc(document);
		return true;
	}

	xmlFreeDoc(document);
	warn_user("TreeLoadError",
		  "(<html>...<body>...<ul> not found.)");
	return false;
}
示例#3
0
文件: htmlp.c 项目: miwoow/bscan
void htmlp_get_link(spider_t *sp, const char *fname, const char *encoding)
{
  htmlDocPtr docp;
  xmlXPathContextPtr context;
  int i=0, j=0;
  xmlXPathObjectPtr result;
  xmlNodeSetPtr nodeset;
  struct _xmlAttr *attrs;
  char *url;
  char complete_url[1024] = {0};
  url_queue_t *uqueue = sp->urlq;
  char *pos = NULL;
  char *cur_chr = NULL;

  char *url_tmp_buf = NULL;
  /*
  printf("*******************************\n");
  printf("%s\n", sp->cur_url);
  printf("*******************************\n");
  */

  docp = htmlParseFile(fname, encoding);
  context = xmlXPathNewContext(docp);
  result = xmlXPathEvalExpression("//a", context);
  xmlXPathFreeContext(context);
  if(!xmlXPathNodeSetIsEmpty(result->nodesetval)) {
    nodeset = result->nodesetval;
    for(i=0; i < nodeset->nodeNr; i++) {
      attrs = nodeset->nodeTab[i]->properties;
      while(attrs) {
	if (strncasecmp(attrs->name, "href", 4) == 0) {
	  url = attrs->children->content;
	  url_tmp_buf = calloc(1, strlen(url) + 1);
	  struct struct_parts uobj;
	  memcpy(url_tmp_buf, url, strlen(url));
	  printf("===================\n");
	  printf("url:%s\n", url);
	  urlp_parse(url_tmp_buf, &uobj);
	  printf("access: %s\n", uobj.access);
	  printf("host:%s\n", uobj.host);
	  printf("absolute:%s\n", uobj.absolute);
	  printf("relative:%s\n", uobj.relative);
	  printf("search:%s\n", uobj.search);
	  printf("anchor:%s\n", uobj.anchor);
	  printf("===================\n");
	  if (uobj.access != NULL) {
	    if (strcasecmp(uobj.access, "javascript") != 0) {
	      if (uobj.host != NULL) {
		if (memcmp(uobj.host, (sp->root_url_obj).host, strlen(uobj.host)) == 0) {
		  url_queue_uniq_add(uqueue, url);
		}
	      } else {
		url_queue_uniq_add(uqueue, url);
	      }
	    }
	  } else {
	    if (uobj.access == NULL)
	      uobj.access = (sp->root_url_obj).access;
	    if (uobj.host == NULL)
	      uobj.host = (sp->root_url_obj).host;
	    if (uobj.relative || uobj.absolute) {
	      memset(complete_url, 0, 1024);
	      if (uobj.relative) {
		pos = strrchr(sp->cur_url, '/');
		if (*(pos-1) == '/' && *(pos-2) == ':') {
		  snprintf(complete_url, 1023, "%s://%s",
			   uobj.access,
			   uobj.host);
		  complete_url[strlen(complete_url)] = '/';
		  strncat(complete_url, uobj.relative, 1023);
		} else {
		  for (j=0, cur_chr = sp->cur_url; cur_chr < pos + 1; cur_chr++, j++) {
		    complete_url[j] = *cur_chr;
		  }
		  strncat(complete_url, uobj.relative, 1023);
		}
	      }
	      
	      if (uobj.absolute) {
		snprintf(complete_url, 1023, "%s://%s",
			 uobj.access,
			 uobj.host);
		complete_url[strlen(complete_url)] = '/';
		strncat(complete_url, uobj.absolute, 1023);
	      }
	    } else {
	      memset(complete_url, 0, 1024);
	      snprintf(complete_url, 1023, "%s://%s",
		       uobj.access,
		       uobj.host);
	      complete_url[strlen(complete_url)] = '/'; 
	    }
	    if (uobj.search) {
	      complete_url[strlen(complete_url)] = '?';
	      strncat(complete_url, uobj.search, 1023);
	    }
	    if (uobj.anchor) {
	      complete_url[strlen(complete_url)] = '#';
	      strncat(complete_url, uobj.anchor, 1023);
	    }
	    printf("******preurl: %s\n", complete_url);
	    urlp_strip_dot(complete_url);
	    printf("******passurl: %s\n", complete_url);
	    url_queue_uniq_add(uqueue, complete_url);
	  }
	  
	  free(url_tmp_buf);
	  break;
	}
	attrs = attrs->next;
      }
    }
  }
  xmlXPathFreeObject (result);
  xmlCleanupParser();
}
示例#4
0
文件: xsltproc.c 项目: way2joy/Sxslt
/**
 * Entry point of the xsltproc command line tool with embedded R support.
 *
 * The options are parsed first. The first argument that is not an option
 * (or is "-") is then parsed as the stylesheet, and every argument after it
 * is parsed as a document and handed to xsltProcess(). When the first file
 * carries its own stylesheet (xsltLoadStylesheetPI() succeeds) it is
 * processed against itself and the program exits.
 *
 * Options that take a value (-o, -param, -maxdepth) are rejected with a
 * usage message when the value is missing. The R interpreter is started by
 * "--r"; "--r=SCRIPT" additionally sources SCRIPT after start-up.
 *
 * \param argc  number of command line arguments
 * \param argv  command line arguments
 * \return 1 on a command line error, 0 otherwise
 */
int
main(int argc, char **argv)
{
    int i;
    xsltStylesheetPtr cur = NULL;
    xmlDocPtr doc, style;

    if (argc <= 1) {
        usage(argv[0]);
        return (1);
    }

    xmlInitMemory();

    LIBXML_TEST_VERSION

    defaultLoader = xmlGetExternalEntityLoader();
    xmlLineNumbersDefault(1);

    for (i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-"))
            break;

        if (argv[i][0] != '-')
            continue;
#ifdef LIBXML_DEBUG_ENABLED
        if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug"))) {
            debug++;
        } else
#endif
        if ((!strcmp(argv[i], "-v")) ||
                (!strcmp(argv[i], "-verbose")) ||
                (!strcmp(argv[i], "--verbose"))) {
            xsltSetGenericDebugFunc(stderr, NULL);
        } else if ((!strcmp(argv[i], "-o")) ||
                   (!strcmp(argv[i], "-output")) ||
                   (!strcmp(argv[i], "--output"))) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Option %s requires an argument\n", argv[i]);
                usage(argv[0]);
                return (1);
            }
            /* consume only the value; the loop's own i++ steps past it */
            output = argv[++i];
        } else if ((!strcmp(argv[i], "-V")) ||
                   (!strcmp(argv[i], "-version")) ||
                   (!strcmp(argv[i], "--version"))) {
            printf("Using libxml %s, libxslt %s and libexslt %s\n",
                   xmlParserVersion, xsltEngineVersion, exsltLibraryVersion);
            printf
    ("xsltproc was compiled against libxml %d, libxslt %d and libexslt %d\n",
                 LIBXML_VERSION, LIBXSLT_VERSION, LIBEXSLT_VERSION);
            printf("libxslt %d was compiled against libxml %d\n",
                   xsltLibxsltVersion, xsltLibxmlVersion);
            printf("libexslt %d was compiled against libxml %d\n",
                   exsltLibexsltVersion, exsltLibxmlVersion);
        } else if ((!strcmp(argv[i], "-repeat"))
                   || (!strcmp(argv[i], "--repeat"))) {
            /* first occurrence selects 20 runs, any further one 100 */
            if (repeat == 0)
                repeat = 20;
            else
                repeat = 100;
        } else if ((!strcmp(argv[i], "-novalid")) ||
                   (!strcmp(argv[i], "--novalid"))) {
            novalid++;
        } else if ((!strcmp(argv[i], "-noout")) ||
                   (!strcmp(argv[i], "--noout"))) {
            noout++;
#ifdef LIBXML_DOCB_ENABLED
        } else if ((!strcmp(argv[i], "-docbook")) ||
                   (!strcmp(argv[i], "--docbook"))) {
            docbook++;
#endif
#ifdef LIBXML_HTML_ENABLED
        } else if ((!strcmp(argv[i], "-html")) ||
                   (!strcmp(argv[i], "--html"))) {
            html++;
#endif
        } else if ((!strcmp(argv[i], "-timing")) ||
                   (!strcmp(argv[i], "--timing"))) {
            timing++;
        } else if ((!strcmp(argv[i], "-profile")) ||
                   (!strcmp(argv[i], "--profile"))) {
            profile++;
        } else if ((!strcmp(argv[i], "-norman")) ||
                   (!strcmp(argv[i], "--norman"))) {
            profile++;
        } else if ((!strcmp(argv[i], "-warnnet")) ||
                   (!strcmp(argv[i], "--warnnet"))) {
            xmlSetExternalEntityLoader(xsltNoNetExternalEntityLoader);
        } else if ((!strcmp(argv[i], "-nonet")) ||
                   (!strcmp(argv[i], "--nonet"))) {
            xmlSetExternalEntityLoader(xsltNoNetExternalEntityLoader);
            nonet = 1;
#ifdef LIBXML_CATALOG_ENABLED
        } else if ((!strcmp(argv[i], "-catalogs")) ||
                   (!strcmp(argv[i], "--catalogs"))) {
            const char *catalogs;

            catalogs = getenv("SGML_CATALOG_FILES");
            if (catalogs == NULL) {
                fprintf(stderr, "Variable $SGML_CATALOG_FILES not set\n");
            } else {
                xmlLoadCatalogs(catalogs);
            }
#endif
#ifdef LIBXML_XINCLUDE_ENABLED
        } else if ((!strcmp(argv[i], "-xinclude")) ||
                   (!strcmp(argv[i], "--xinclude"))) {
            xinclude++;
            xsltSetXIncludeDefault(1);
#endif
        } else if ((!strcmp(argv[i], "-param")) ||
                   (!strcmp(argv[i], "--param"))) {
            if (i + 2 >= argc) {
                fprintf(stderr, "Option %s requires a name and a value\n",
                        argv[i]);
                usage(argv[0]);
                return (1);
            }
            /* same limit as before, but checked before anything is stored */
            if (nbparams + 2 >= 16) {
                fprintf(stderr, "too many params\n");
                return (1);
            }
            params[nbparams++] = argv[++i];
            params[nbparams++] = argv[++i];
        } else if ((!strcmp(argv[i], "-maxdepth")) ||
                   (!strcmp(argv[i], "--maxdepth"))) {
            int value;

            if (i + 1 >= argc) {
                fprintf(stderr, "Option %s requires an argument\n", argv[i]);
                usage(argv[0]);
                return (1);
            }
            i++;
            if (sscanf(argv[i], "%d", &value) == 1) {
                if (value > 0)
                    xsltMaxDepth = value;
            }
        } else if (!strcmp(argv[i], "--r")) {
            /* start R without a start-up script */
            startRAutomatically = 1;
            RstartupScript = NULL;
        } else if (!strncmp(argv[i], "--r=", 4)) {
            /* start R and source the script named after the '=' */
            startRAutomatically = 1;
            RstartupScript = argv[i] + 4;
        } else {
            fprintf(stderr, "Unknown option %s\n", argv[i]);
            usage(argv[0]);
            return (1);
        }
    }
    params[nbparams] = NULL;

    /*
     * Choose the DTD loading mode only now, so that -novalid takes effect.
     */
    if (novalid == 0)           /* TODO XML_DETECT_IDS | XML_COMPLETE_ATTRS */
        xmlLoadExtDtdDefaultValue = 6;
    else
        xmlLoadExtDtdDefaultValue = 0;

    /*
     * Replace entities with their content.
     */
    xmlSubstituteEntitiesDefault(1);

    /*
     * Register the EXSLT extensions
     */
    exsltRegisterAll();
    registerRModule(0);
    if(startRAutomatically) {
        extern int RXSLT_internalSource(const char *fileName);
        int rargs = 1;
        const char *rargv[] = { "Sxsltproc" };
        Rf_initEmbeddedR(rargs, rargv);
        loadXSLPackage();
        if(RstartupScript && RstartupScript[0])
            RXSLT_internalSource(RstartupScript);
    }

    /*
     * Second pass: skip the options and their values, then load the
     * first remaining argument as the stylesheet.
     */
    for (i = 1; i < argc; i++) {
        if ((!strcmp(argv[i], "-maxdepth")) ||
            (!strcmp(argv[i], "--maxdepth"))) {
            i++;
            continue;
        } else if ((!strcmp(argv[i], "-o")) ||
                   (!strcmp(argv[i], "-output")) ||
                   (!strcmp(argv[i], "--output"))) {
            i++;
            continue;
        }
        if ((!strcmp(argv[i], "-param")) || (!strcmp(argv[i], "--param"))) {
            i += 2;
            continue;
        } else if(!strcmp(argv[i], "--r")) {
            continue;
        }

        if ((argv[i][0] != '-') || (strcmp(argv[i], "-") == 0)) {
            if (timing)
                gettimeofday(&begin, NULL);
            style = xmlParseFile((const char *) argv[i]);
            if (timing) {
                long msec;

                gettimeofday(&end, NULL);
                msec = end.tv_sec - begin.tv_sec;
                msec *= 1000;
                msec += (end.tv_usec - begin.tv_usec) / 1000;
                fprintf(stderr, "Parsing stylesheet %s took %ld ms\n",
                        argv[i], msec);
            }
            if (style == NULL) {
                fprintf(stderr,  "cannot parse %s\n", argv[i]);
                cur = NULL;
            } else {
                cur = xsltLoadStylesheetPI(style);
                if (cur != NULL) {
                    /* it is an embedded stylesheet */
                    xsltProcess(style, cur, argv[i]);
                    xsltFreeStylesheet(cur);
                    exit(0);
                }
                cur = xsltParseStylesheetDoc(style);
                if (cur != NULL) {
                    if (cur->indent == 1)
                        xmlIndentTreeOutput = 1;
                    else
                        xmlIndentTreeOutput = 0;
                    /* the documents start right after the stylesheet */
                    i++;
                }
            }
            break;

        }
    }

    /*
     * disable CDATA from being built in the document tree
     */
    xmlDefaultSAXHandlerInit();
    xmlDefaultSAXHandler.cdataBlock = NULL;

    if ((cur != NULL) && (cur->errors == 0)) {
        for (; i < argc; i++) {
            doc = NULL;
            if (timing)
                gettimeofday(&begin, NULL);
#ifdef LIBXML_HTML_ENABLED
            if (html)
                doc = htmlParseFile(argv[i], NULL);
            else
#endif
#ifdef LIBXML_DOCB_ENABLED
            if (docbook)
                doc = docbParseFile(argv[i], NULL);
            else
#endif
                doc = xmlParseFile(argv[i]);
            if (doc == NULL) {
                fprintf(stderr, "unable to parse %s\n", argv[i]);
                continue;
            }
            if (timing) {
                long msec;

                gettimeofday(&end, NULL);
                msec = end.tv_sec - begin.tv_sec;
                msec *= 1000;
                msec += (end.tv_usec - begin.tv_usec) / 1000;
                fprintf(stderr, "Parsing document %s took %ld ms\n",
                        argv[i], msec);
            }
            /* xsltProcess() releases doc */
            xsltProcess(doc, cur, argv[i]);
        }
        xsltFreeStylesheet(cur);
    }
#ifdef CAN_UNREGISTER_MODULES
    xsltUnregisterAllExtModules();
#endif
    xmlCleanupParser();
    xmlMemoryDump();
    return (0);
}
示例#5
0
文件: xsltproc.c 项目: way2joy/Sxslt
static void
xsltProcess(xmlDocPtr doc, xsltStylesheetPtr cur, const char *filename) {
    xmlDocPtr res;

#ifdef LIBXML_XINCLUDE_ENABLED
    if (xinclude) {
	if (timing)
	    gettimeofday(&begin, NULL);
	xmlXIncludeProcess(doc);
	if (timing) {
	    long msec;

	    gettimeofday(&end, NULL);
	    msec = end.tv_sec - begin.tv_sec;
	    msec *= 1000;
	    msec += (end.tv_usec - begin.tv_usec) / 1000;
	    fprintf(stderr, "XInclude processing %s took %ld ms\n",
		    filename, msec);
	}
    }
#endif
    if (timing)
	gettimeofday(&begin, NULL);
    if (output == NULL) {
	if (repeat) {
	    int j;

	    for (j = 1; j < repeat; j++) {
		res = xsltApplyStylesheet(cur, doc, params);
		xmlFreeDoc(res);
		xmlFreeDoc(doc);
#ifdef LIBXML_HTML_ENABLED
		if (html)
		    doc = htmlParseFile(filename, NULL);
		else
#endif
#ifdef LIBXML_DOCB_ENABLED
		if (docbook)
		    doc = docbParseFile(filename, NULL);
		else
#endif
		    doc = xmlParseFile(filename);
	    }
	}
	if (profile) {
	    res = xsltProfileStylesheet(cur, doc, params, stderr);
	} else {
	    res = xsltApplyStylesheet(cur, doc, params);
	}
	if (timing) {
	    long msec;

	    gettimeofday(&end, NULL);
	    msec = end.tv_sec - begin.tv_sec;
	    msec *= 1000;
	    msec += (end.tv_usec - begin.tv_usec) / 1000;
	    if (repeat)
		fprintf(stderr,
			"Applying stylesheet %d times took %ld ms\n",
			repeat, msec);
	    else
		fprintf(stderr,
			"Applying stylesheet took %ld ms\n", msec);
	}
	xmlFreeDoc(doc);
	if (res == NULL) {
	    fprintf(stderr, "no result for %s\n", filename);
	    return;
	}
	if (noout) {
	    xmlFreeDoc(res);
	    return;
	}
#ifdef LIBXML_DEBUG_ENABLED
	if (debug)
	    xmlDebugDumpDocument(stdout, res);
	else {
#endif
	    if (cur->methodURI == NULL) {
		if (timing)
		    gettimeofday(&begin, NULL);
		xsltSaveResultToFile(stdout, res, cur);
		if (timing) {
		    long msec;

		    gettimeofday(&end, NULL);
		    msec = end.tv_sec - begin.tv_sec;
		    msec *= 1000;
		    msec += (end.tv_usec - begin.tv_usec) / 1000;
		    fprintf(stderr, "Saving result took %ld ms\n",
			    msec);
		}
	    } else {
		if (xmlStrEqual
		    (cur->method, (const xmlChar *) "xhtml")) {
		    fprintf(stderr, "non standard output xhtml\n");
		    if (timing)
			gettimeofday(&begin, NULL);
		    xsltSaveResultToFile(stdout, res, cur);
		    if (timing) {
			long msec;

			gettimeofday(&end, NULL);
			msec = end.tv_sec - begin.tv_sec;
			msec *= 1000;
			msec +=
			    (end.tv_usec - begin.tv_usec) / 1000;
			fprintf(stderr,
				"Saving result took %ld ms\n",
				msec);
		    }
		} else {
		    fprintf(stderr,
			    "Unsupported non standard output %s\n",
			    cur->method);
		}
	    }
#ifdef LIBXML_DEBUG_ENABLED
	}
#endif

	xmlFreeDoc(res);
    } else {
	xsltRunStylesheet(cur, doc, params, output, NULL, NULL);
	if (timing) {
	    long msec;

	    gettimeofday(&end, NULL);
	    msec = end.tv_sec - begin.tv_sec;
	    msec *= 1000;
	    msec += (end.tv_usec - begin.tv_usec) / 1000;
	    fprintf(stderr,
		"Running stylesheet and saving result took %ld ms\n",
		    msec);
	}
	xmlFreeDoc(doc);
    }
}
示例#6
0
/**
 * Reads an HTML document from stdin and prints one request line per form.
 *
 * The input is spooled to a randomly named file below /tmp/form2post/ and
 * parsed from there. For every <form> the output line is
 *
 *     METHOD ACTION?name=value&name=value
 *
 * where METHOD is "POST" when the form's method attribute equals "post"
 * (compared case-insensitively) and "GET" otherwise. Inputs without a name
 * are skipped and missing actions/values are printed as empty strings.
 *
 * \return 0 on success, 1 when the temporary file cannot be written or the
 *         document cannot be parsed or has no root element
 */
int main(int argc, char** argv)
{
	htmlDocPtr doc = NULL;
	xmlNodePtr root = NULL, form = NULL, input = NULL;
	size_t inputIndex = 0, formIndex = 0, i = 0;
	char tmpFileName[31] = "/tmp/form2post/";
	const char *formAction, *formMethod, *inputName, *inputValue;
	const char *post = "POST";
	int tmpChar;	/* int, so that EOF is distinguishable from every byte */
	int firstParam;
	FILE* tmpFile;

	struct stat st = {0};

	(void)argc;
	(void)argv;

	if(stat(tmpFileName, &st) == -1)
	{
		/* a failure here surfaces as an fopen() error below */
		mkdir(tmpFileName, 0777);
	}

	// Generate random tmp file name: 15 lowercase letters after the
	// directory prefix; the last array element stays '\0'
	srand(time(NULL));
	for(i = 15; i < 30; ++i)
	{
		tmpFileName[i] = rand() % 26 + 97;
	}

	// Write stdin to tmp file
	tmpFile = fopen(tmpFileName, "w+");
	if(tmpFile == NULL)
	{
		fprintf(stderr, "Failed to create temporary file\n");
		return 1;
	}
	while((tmpChar = getchar()) != EOF)
	{
		fputc(tmpChar, tmpFile);
	}
	if(fclose(tmpFile) != 0)
	{
		fprintf(stderr, "Failed to write temporary file\n");
		remove(tmpFileName);
		return 1;
	}

	// Open HTML file in XML parser
	doc = htmlParseFile(tmpFileName, "UTF-8");
	if(doc == NULL)
	{
		fprintf(stderr, "Failed to parse file\n");
		remove(tmpFileName);
		return 1;
	}

	// Get XML root element
	root = xmlDocGetRootElement(doc);
	if(root == NULL)
	{
		fprintf(stderr, "This file contains no valid root element\n");
		xmlFreeDoc(doc);
		remove(tmpFileName);
		return 1;
	}

	// Loop through all "form" tags
	while((form = getTag(root, "form", formIndex++)) != NULL)
	{
		formAction = getAttribute(form, "action");
		formMethod = getAttribute(form, "method");
		if(formMethod != NULL)
		{
			// Case-insensitive comparison against "POST"; the loop
			// stops at the first mismatch or at the end of "POST"
			for(i = 0; post[i] != '\0' &&
				toupper((unsigned char)formMethod[i]) == post[i]; ++i)
			{
			}
			if(post[i] == '\0' && formMethod[i] == '\0')
			{
				formMethod = "POST";
			}
			else
			{
				formMethod = "GET";
			}
		}
		else
		{
			formMethod = "GET";
		}

		if(formAction == NULL)
		{
			formAction = "";
		}
		printf("%s %s", formMethod, formAction);

		// Loop through each input of the current form
		inputIndex = 0;
		firstParam = 1;
		while((input = getTag(form, "input", inputIndex++)) != NULL)
		{
			inputName = getAttribute(input, "name");
			inputValue = getAttribute(input, "value");

			if(inputName == NULL)
			{
				continue;
			}
			if(inputValue == NULL)
			{
				inputValue = "";
			}
			// '?' introduces the first parameter actually printed,
			// even when earlier inputs were skipped for having no name
			putchar(firstParam ? '?' : '&');
			firstParam = 0;
			printf("%s=%s", inputName, inputValue);
		}
		printf("\n");
	}

	xmlFreeDoc(doc);
	remove(tmpFileName);
	return 0;
}