char* GetLinksByTag(char* htmlcontent, char* url, char * tag_name, char * attribute) { // Buffers for the stream parser. char tag[8]; char attr[20]; char val[1024]; // Links string. char* links = malloc(1); size_t links_length = 0; size_t i; // Loop iterator char* base = malloc(strlen(url) + 1); strcpy(base, url); // Initialize the parser HTMLSTREAMPARSER *hsp = html_parser_init( ); html_parser_set_tag_to_lower(hsp, 1); html_parser_set_attr_to_lower(hsp, 1); html_parser_set_tag_buffer(hsp, tag, sizeof(tag)); html_parser_set_attr_buffer(hsp, attr, sizeof(attr)); html_parser_set_val_buffer(hsp, val, sizeof(val)-1); links[0] = '\0'; const size_t document_size = strlen(htmlcontent); const size_t tag_length = strlen(tag_name); const size_t attr_length = strlen(attribute); // Loop over document for (i = 0; i < document_size; i++) { html_parser_char_parse(hsp, htmlcontent[i]); // Detect different html base if (html_parser_cmp_tag(hsp, "base", 4) && html_parser_cmp_attr(hsp, "href", 4) && html_parser_is_in(hsp, HTML_VALUE_ENDED)) { html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; base = realloc(base, strlen(html_parser_val(hsp)) + 1); strcpy(base, html_parser_val(hsp)); } if (html_parser_cmp_tag(hsp, tag_name, tag_length)) { if (html_parser_cmp_attr(hsp, attribute, attr_length)) { if (html_parser_is_in(hsp, HTML_VALUE_ENDED)) { html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; char* link = FormatLink(base, html_parser_val(hsp)); // Plus 1 byte for the newline size_t link_length = strlen(link) + 1; // Plus 1 byte for the null terminator links = realloc(links, links_length + link_length + 1); strcat(links, link); strcat(links, "\n"); links_length += link_length; free(link); } } } } /*release the hsp*/ html_parser_cleanup(hsp); free(base); return links; }
/*
 * Data callback that pushes a received chunk through the HTML stream parser
 * and prints every <a href="..."> value to stdout, one per line.
 *
 * NOTE(review): the signature matches a libcurl CURLOPT_WRITEFUNCTION
 * callback, with `hsp` as the user-data pointer - confirm against the caller.
 *
 * buffer - chunk of bytes to parse
 * size   - size of one element
 * nmemb  - number of elements in the chunk
 * hsp    - parser handle, passed through untyped
 *
 * Returns size * nmemb, i.e. the whole chunk is reported as consumed.
 */
static size_t write_callback(void *buffer, size_t size, size_t nmemb, void *hsp)
{
    const size_t total = size * nmemb;
    const char *bytes = buffer;
    size_t idx;

    for (idx = 0; idx < total; idx++) {
        html_parser_char_parse(hsp, bytes[idx]);

        /* Only a completed href value of an <a> tag is of interest. */
        if (!html_parser_cmp_tag(hsp, "a", 1)) {
            continue;
        }
        if (!html_parser_cmp_attr(hsp, "href", 4)) {
            continue;
        }
        if (!html_parser_is_in(hsp, HTML_VALUE_ENDED)) {
            continue;
        }

        /* Terminate the parser's value buffer before printing it. */
        html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0';
        printf("%s\n", html_parser_val(hsp));
    }

    return total;
}