Ejemplo n.º 1
0
/*
 * Scans htmlcontent one character at a time with the HTML stream parser and
 * collects the value of `attribute` for every `tag_name` tag, passing each
 * value through FormatLink() together with the current base URL.
 *
 * The base URL starts as a copy of `url` and is replaced whenever a
 * <base href="..."> value is completed in the document.
 *
 * The parser is asked to lower-case tag and attribute names, so tag_name and
 * attribute are presumably expected in lower case (confirm against the
 * parser's documentation).
 *
 * Parameters:
 *   htmlcontent - NUL-terminated HTML document.
 *   url         - NUL-terminated initial base URL.
 *   tag_name    - NUL-terminated tag name to match.
 *   attribute   - NUL-terminated attribute name to match.
 *
 * Returns a heap-allocated, NUL-terminated string holding one formatted link
 * per line (each followed by '\n'); it is empty when nothing matched. The
 * caller owns the string and must free() it. Returns NULL when an argument
 * is NULL, when html_parser_init() yields NULL, or when an allocation fails.
 */
char* GetLinksByTag(char* htmlcontent, char* url, char * tag_name, char * attribute)
{
	// Buffers for the stream parser.
	char tag[8];
	char attr[20];
	char val[1024];
	// Links string and the number of characters currently stored in it.
	char *links = NULL;
	size_t links_length = 0;
	char *base = NULL;
	HTMLSTREAMPARSER *hsp = NULL;
	size_t document_size;
	size_t tag_length;
	size_t attr_length;
	size_t base_size;
	size_t i; // Loop iterator
	int ok = 0;

	if (htmlcontent == NULL || url == NULL || tag_name == NULL || attribute == NULL)
		return NULL;

	document_size = strlen(htmlcontent);
	tag_length = strlen(tag_name);
	attr_length = strlen(attribute);

	links = malloc(1);
	if (links == NULL)
		goto cleanup;
	links[0] = '\0';

	base_size = strlen(url) + 1;
	base = malloc(base_size);
	if (base == NULL)
		goto cleanup;
	memcpy(base, url, base_size);

	// Initialize the parser
	hsp = html_parser_init();
	if (hsp == NULL)
		goto cleanup;
	html_parser_set_tag_to_lower(hsp, 1);
	html_parser_set_attr_to_lower(hsp, 1);
	html_parser_set_tag_buffer(hsp, tag, sizeof(tag));
	html_parser_set_attr_buffer(hsp, attr, sizeof(attr));
	// One byte is held back so the terminator written below always fits.
	html_parser_set_val_buffer(hsp, val, sizeof(val)-1);

	// Loop over document
	for (i = 0; i < document_size; i++) {
		html_parser_char_parse(hsp, htmlcontent[i]);

		// Detect different html base
		if (html_parser_cmp_tag(hsp, "base", 4) && html_parser_cmp_attr(hsp, "href", 4) && html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
			char *value = html_parser_val(hsp);
			char *new_base;
			size_t value_size;

			value[html_parser_val_length(hsp)] = '\0';
			value_size = strlen(value) + 1;
			// Keep the old pointer until realloc succeeds so it is not leaked.
			new_base = realloc(base, value_size);
			if (new_base == NULL)
				goto cleanup;
			base = new_base;
			memcpy(base, value, value_size);
		}

		if (html_parser_cmp_tag(hsp, tag_name, tag_length) && html_parser_cmp_attr(hsp, attribute, attr_length) && html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
			char *value = html_parser_val(hsp);
			char *link;
			char *grown;
			size_t link_length;

			value[html_parser_val_length(hsp)] = '\0';
			link = FormatLink(base, value);
			// FormatLink's failure contract is not visible here; a NULL
			// result is skipped instead of being dereferenced.
			if (link == NULL)
				continue;

			link_length = strlen(link);
			// Plus 1 byte for the newline and 1 byte for the null terminator
			grown = realloc(links, links_length + link_length + 2);
			if (grown == NULL) {
				free(link);
				goto cleanup;
			}
			links = grown;
			// Append at the known end instead of rescanning with strcat.
			memcpy(links + links_length, link, link_length);
			links_length += link_length;
			links[links_length++] = '\n';
			links[links_length] = '\0';
			free(link);
		}
	}
	ok = 1;

cleanup:
	/*release the hsp*/
	if (hsp != NULL)
		html_parser_cleanup(hsp);
	free(base);
	if (!ok) {
		free(links);
		links = NULL;
	}
	return links;
}
Ejemplo n.º 2
0
/*
 * Feeds every byte of the received chunk to the HTML stream parser passed in
 * hsp and prints, one per line, the value of each completed href attribute
 * found on an <a> tag.
 *
 * buffer holds size * nmemb bytes; that byte count is also the return value.
 * NOTE(review): the signature has the shape of a libcurl write callback -
 * presumably it is registered as one; confirm at the call site.
 */
static size_t write_callback(void *buffer, size_t size, size_t nmemb,
                             void *hsp)
{
  size_t total = size * nmemb;
  size_t pos = 0;

  while (pos < total) {
    html_parser_char_parse(hsp, ((char *)buffer)[pos]);

    /* Short-circuit keeps the tag -> attribute -> state check order. */
    if (html_parser_cmp_tag(hsp, "a", 1) &&
        html_parser_cmp_attr(hsp, "href", 4) &&
        html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
      /* Terminate the collected value so it can be printed as a string. */
      html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0';
      printf("%s\n", html_parser_val(hsp));
    }

    pos++;
  }

  return total;
}