示例#1
0
int main(int argc, char *argv[])
{
  char tag[1], attr[4], val[128];
  CURL *curl;
  HTMLSTREAMPARSER *hsp;

  if (argc != 2) {
    printf("Usage: %s URL\n", argv[0]);
    return EXIT_FAILURE;
  }

  curl = curl_easy_init();

  hsp = html_parser_init();

  html_parser_set_tag_to_lower(hsp, 1);
  html_parser_set_attr_to_lower(hsp, 1);
  html_parser_set_tag_buffer(hsp, tag, sizeof(tag));
  html_parser_set_attr_buffer(hsp, attr, sizeof(attr));
  html_parser_set_val_buffer(hsp, val, sizeof(val)-1);

  curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, hsp);
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);

  curl_easy_perform(curl);

  curl_easy_cleanup(curl);

  html_parser_cleanup(hsp);

  return EXIT_SUCCESS;
}
示例#2
0
char* GetLinksByTag(char* htmlcontent, char* url, char * tag_name, char * attribute)
{
	// Buffers for the stream parser.
	char tag[8];
	char attr[20];
	char val[1024];
	// Links string.
	char* links = malloc(1);
	size_t links_length = 0;
	size_t i; // Loop iterator
	char* base = malloc(strlen(url) + 1);

	strcpy(base, url);

	// Initialize the parser
	HTMLSTREAMPARSER *hsp = html_parser_init( );
	html_parser_set_tag_to_lower(hsp, 1);
	html_parser_set_attr_to_lower(hsp, 1);
	html_parser_set_tag_buffer(hsp, tag, sizeof(tag));
	html_parser_set_attr_buffer(hsp, attr, sizeof(attr));
	html_parser_set_val_buffer(hsp, val, sizeof(val)-1);

	links[0] = '\0';

	const size_t document_size = strlen(htmlcontent);
	const size_t tag_length = strlen(tag_name);
	const size_t attr_length = strlen(attribute);

	// Loop over document
	for (i = 0; i < document_size; i++) {
		html_parser_char_parse(hsp, htmlcontent[i]);
		// Detect different html base
		if (html_parser_cmp_tag(hsp, "base", 4) && html_parser_cmp_attr(hsp, "href", 4) && html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
			html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0';
			base = realloc(base, strlen(html_parser_val(hsp)) + 1);
			strcpy(base, html_parser_val(hsp));
		}
		if (html_parser_cmp_tag(hsp, tag_name, tag_length)) {
			if (html_parser_cmp_attr(hsp, attribute, attr_length)) {
				if (html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
					html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0';
					char* link = FormatLink(base, html_parser_val(hsp));
					// Plus 1 byte for the newline
					size_t link_length = strlen(link) + 1;
					// Plus 1 byte for the null terminator
					links = realloc(links, links_length + link_length + 1);
					strcat(links, link);
					strcat(links, "\n");
					links_length += link_length;
					free(link);
				}
			}
		}
	}

	/*release the hsp*/
	html_parser_cleanup(hsp);
	free(base);
	return links;
}