int main(int argc, char *argv[]) { char tag[1], attr[4], val[128]; CURL *curl; HTMLSTREAMPARSER *hsp; if (argc != 2) { printf("Usage: %s URL\n", argv[0]); return EXIT_FAILURE; } curl = curl_easy_init(); hsp = html_parser_init(); html_parser_set_tag_to_lower(hsp, 1); html_parser_set_attr_to_lower(hsp, 1); html_parser_set_tag_buffer(hsp, tag, sizeof(tag)); html_parser_set_attr_buffer(hsp, attr, sizeof(attr)); html_parser_set_val_buffer(hsp, val, sizeof(val)-1); curl_easy_setopt(curl, CURLOPT_URL, argv[1]); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, hsp); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_perform(curl); curl_easy_cleanup(curl); html_parser_cleanup(hsp); return EXIT_SUCCESS; }
char* GetLinksByTag(char* htmlcontent, char* url, char * tag_name, char * attribute) { // Buffers for the stream parser. char tag[8]; char attr[20]; char val[1024]; // Links string. char* links = malloc(1); size_t links_length = 0; size_t i; // Loop iterator char* base = malloc(strlen(url) + 1); strcpy(base, url); // Initialize the parser HTMLSTREAMPARSER *hsp = html_parser_init( ); html_parser_set_tag_to_lower(hsp, 1); html_parser_set_attr_to_lower(hsp, 1); html_parser_set_tag_buffer(hsp, tag, sizeof(tag)); html_parser_set_attr_buffer(hsp, attr, sizeof(attr)); html_parser_set_val_buffer(hsp, val, sizeof(val)-1); links[0] = '\0'; const size_t document_size = strlen(htmlcontent); const size_t tag_length = strlen(tag_name); const size_t attr_length = strlen(attribute); // Loop over document for (i = 0; i < document_size; i++) { html_parser_char_parse(hsp, htmlcontent[i]); // Detect different html base if (html_parser_cmp_tag(hsp, "base", 4) && html_parser_cmp_attr(hsp, "href", 4) && html_parser_is_in(hsp, HTML_VALUE_ENDED)) { html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; base = realloc(base, strlen(html_parser_val(hsp)) + 1); strcpy(base, html_parser_val(hsp)); } if (html_parser_cmp_tag(hsp, tag_name, tag_length)) { if (html_parser_cmp_attr(hsp, attribute, attr_length)) { if (html_parser_is_in(hsp, HTML_VALUE_ENDED)) { html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; char* link = FormatLink(base, html_parser_val(hsp)); // Plus 1 byte for the newline size_t link_length = strlen(link) + 1; // Plus 1 byte for the null terminator links = realloc(links, links_length + link_length + 1); strcat(links, link); strcat(links, "\n"); links_length += link_length; free(link); } } } } /*release the hsp*/ html_parser_cleanup(hsp); free(base); return links; }