const std::vector<CMyHtmlParser::HtmlTag>& CMyHtmlParser::parse(const QByteArray& html) { _tags.clear(); _tree = myhtml_tree_create(); assert_r(_tree); assert_r(myhtml_tree_init(_tree, _myhtmlInstance) == MyCORE_STATUS_OK); myhtml_callback_tree_node_insert_set(_tree, &CMyHtmlParser::callbackNodeInserted, this); /* * From Specification: * * The authoring conformance requirements for character encoding declarations limit them to only * appearing in the first 1024 bytes. User agents are therefore encouraged to use the prescan * algorithm below (as invoked by these steps) on the first 1024 bytes, but not to stall beyond that. */ _encoding = myencoding_prescan_stream_to_determine_encoding(html.data(), std::min(html.size(), 1024)); if (_encoding == MyENCODING_NOT_DETERMINED) { assert_unconditional_r("Failed to determine data encoding"); _encoding = MyENCODING_UTF_8; } assert_r(myhtml_parse(_tree, _encoding, html.data(), html.size()) == MyCORE_STATUS_OK); myhtml_tree_destroy(_tree); return _tags; }
void test_all(void) { myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); DIR *dir; struct dirent *ent; struct stat path_stat; const char *from_dir = "/new/Test/out/"; size_t from_dir_len = strlen(from_dir); char path[4096]; strncpy(path, from_dir, from_dir_len); size_t count = 0; if((dir = opendir(from_dir)) != NULL) { while((ent = readdir(dir)) != NULL) { sprintf(&path[from_dir_len], "%s", ent->d_name); stat(path, &path_stat); if(ent->d_name[0] != '.' && !S_ISDIR(path_stat.st_mode)) { count++; // printf("%zu: %s\n", count, path); struct res_html res = load_html(path); myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size); // myhtml_tree_node_t **node_list = myhtml_get_elements_by_tag_id(tree, MyHTML_TAG_TITLE, NULL); // // if(node_list && node_list[0]) // if(node_list[0]->token) // myhtml_tree_print_by_tree_idx(tree, node_list[0]->child, stdout, 0); // // myhtml_destroy_node_list(node_list); free(res.html); } } closedir (dir); } myhtml_tree_destroy(tree); myhtml_destroy(myhtml); }
int main(int argc, const char * argv[]) { const char* path; if (argc == 2) { path = argv[1]; } else { printf("Bad ARGV!\nUse: get_title_high_level <path_to_html_file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size); // parse html myhtml_collection_t *collection = myhtml_get_nodes_by_tag_id(tree, NULL, MyHTML_TAG_TITLE, NULL); if(collection && collection->list && collection->length) { myhtml_tree_node_t *text_node = myhtml_node_child(collection->list[0]); if(text_node) { const char* text = myhtml_node_text(text_node, NULL); if(text) printf("Title: %s\n", text); } } // release resources myhtml_collection_destroy(collection); myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }
void chunk_test(void) { struct res_html res = load_html("/new/C-git/myhtml/test/html/chunk.data"); //struct res_html res = load_html("/new/C-git/myhtml/test/broken.html"); myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); myhtml_encoding_set(tree, MyHTML_ENCODING_UTF_8); size_t begin = 0, i = 0; while (i < res.size) { if(res.html[i] == '\n') { //printf("Parse chunk: %.*s\n", (int)(i - begin), &res.html[begin]); myhtml_parse_chunk(tree, &res.html[begin], (i - begin)); myhtml_tokenizer_wait(tree); begin = i + 1; } i++; } myhtml_parse_chunk(tree, &res.html[begin], (i - begin)); myhtml_parse_chunk_end(tree); i = 0; while (i < res.size) { if(res.html[i] != '\n') { printf("%c", res.html[i]); } i++; } printf("\n"); myhtml_tree_print_node_childs(tree, tree->document, stdout, 0); myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); }
int main(int argc, const char * argv[]) { const char* path; if (argc == 2) { path = argv[1]; } else { printf("Bad ARGV!\nUse: get_title_low_level <path_to_html_file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size); // get title from index myhtml_collection_t *titles_list = myhtml_get_nodes_by_tag_id(tree, NULL, MyHTML_TAG_TITLE, NULL); if(titles_list && titles_list->length != 0 && titles_list->list[0]->child) { mycore_string_raw_t str = {0}; myhtml_serialization_node(titles_list->list[0]->child, &str); printf("%s\n", str.data); mycore_string_raw_destroy(&str, false); } // release resources myhtml_collection_destroy(titles_list); myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }
int main(int argc, const char * argv[]) { const char* path; if (argc == 2) { path = argv[1]; } else { printf("Bad ARGV!\nUse: serialization_high_level <path_to_html_file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size); mycore_string_raw_t str_raw; mycore_string_raw_clean_all(&str_raw); if(myhtml_serialization_tree_buffer(myhtml_tree_get_document(tree), &str_raw)) { fprintf(stderr, "Could not serialization for the tree\n"); exit(EXIT_FAILURE); } printf("%s", str_raw.data); mycore_string_raw_destroy(&str_raw, false); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }
int main(int argc, const char * argv[]) { const char* path; const char* attr_key; if (argc == 3) { attr_key = argv[1]; path = argv[2]; } else { printf("Bad ARGV!\nUse: nodes_by_attr_key_high_level <attribute key> <path to html file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size); // get and print myhtml_collection_t *collection = myhtml_get_nodes_by_attribute_key(tree, NULL, NULL, attr_key, strlen(attr_key), NULL); for(size_t i = 0; i < collection->length; i++) myhtml_tree_print_node(tree, collection->list[i], stdout); printf("Total found: %zu\n", collection->length); myhtml_collection_destroy(collection); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); return 0; }
int main(int argc, const char * argv[]) { const char* path; if (argc == 2) { path = argv[1]; } else { printf("Bad ARGV!\nUse: callback_tree_node_high_level <path_to_html_file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // set callbacks myhtml_callback_tree_node_insert_set(tree, callback_node_insert, NULL); myhtml_callback_tree_node_remove_set(tree, callback_node_remove, NULL); // parse html myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }
int main(int argc, const char * argv[]) { char html[] = "<div><span>Best of Fragments</span><a>click to make happy</a></div>"; // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse_fragment(tree, MyHTML_ENCODING_UTF_8, html, strlen(html), MyHTML_TAG_DIV, MyHTML_NAMESPACE_HTML); // print fragment myhtml_tree_print_node_children(tree, myhtml_tree_get_document(tree), stdout, 0); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); return 0; }
int main(int argc, const char * argv[]) { // chunk_test(); // return 0; // //myhtml_encoding_decode_single_byte(0xf4); // myhtml_encoding_result_t res_data = {0, 0, 0, 0, 0, 0}; // // unsigned char data[20]; // data[0] = 0xAC; // data[1] = 0xB1; // data[2] = 0x42; // data[3] = 0x4F; // data[4] = 0x51; // data[5] = 0x1B; // data[6] = 0x28; // data[7] = 0x42; // data[8] = 0; // // //myhtml_encoding_dec_to_char(54936, data); // // enum myhtml_encoding_status status = MyHTML_ENCODING_STATUS_CONTINUE; // // size_t i = 0; // while (status & MyHTML_ENCODING_STATUS_CONTINUE) { // status = myhtml_encoding_decode_euc_kr(data[i], &res_data); // i++; // } // // myhtml_string_convert_dec_to_ansi_utf8(res_data.result, (char *)data); // // return 0; // chunk_test(); // uint64_t all_start1 = myhtml_hperf_clock(NULL); // test_all(); // uint64_t all_stop1 = myhtml_hperf_clock(NULL); // // myhtml_hperf_print("Parse html", all_start1, all_stop1, stdout); // return 0; /* Default path or argument value */ //const char* path = "/new/C-git/myhtml/test/test.html"; //const char* path = "/new/C-git/broken.html"; const char* path = "/new/C-git/test_full.html"; if (argc == 2) { path = argv[1]; } setbuf(stdout, 0); myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); struct res_html res = load_html(path); uint64_t all_start = myhtml_hperf_clock(NULL); uint64_t tree_init_start = myhtml_hperf_clock(NULL); // init once for N html myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); uint64_t tree_init_stop = myhtml_hperf_clock(NULL); uint64_t parse_start = myhtml_hperf_clock(NULL); myhtml_encoding_t encoding; myhtml_encoding_detect(res.html, res.size, &encoding); for(size_t i = 0; i < 1; i++) { //myhtml_parse(tree, text, strlen(text)); myhtml_parse_single(tree, encoding, res.html, res.size); // myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size); // myhtml_tree_print_node_childs(tree, tree->document, stdout, 0); } //usleep(100000000); uint64_t parse_stop = myhtml_hperf_clock(NULL); uint64_t all_stop = myhtml_hperf_clock(NULL); printf("\n\nInformation:\n"); printf("Timer (%llu ticks/sec):\n", (unsigned long long) myhtml_hperf_res(NULL)); myhtml_hperf_print("\tFirst Tree init", tree_init_start, tree_init_stop, stdout); myhtml_hperf_print("\tParse html", parse_start, parse_stop, stdout); myhtml_hperf_print("\tTotal", all_start, all_stop, stdout); printf("\n"); myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }
int main(int argc, const char * argv[]) { const char* path; const char* attr_value; struct res_argv rargv; if(argc > 2) { path = argv[1]; attr_value = argv[2]; rargv = get_argv(3, argc, argv); } else { print_usage(); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // parse html myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size); // get and print myhtml_collection_t* collection = NULL; switch (rargv.search_type) { case 0: collection = myhtml_get_nodes_by_attribute_value(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; case 1: collection = myhtml_get_nodes_by_attribute_value_whitespace_separated(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; case 2: collection = myhtml_get_nodes_by_attribute_value_begin(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; case 3: collection = myhtml_get_nodes_by_attribute_value_end(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; case 4: collection = myhtml_get_nodes_by_attribute_value_contain(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; case 5: collection = myhtml_get_nodes_by_attribute_value_hyphen_separated(tree, NULL, NULL, rargv.is_insensitive, rargv.key, rargv.key_length, attr_value, strlen(attr_value), NULL); break; default: print_usage(); exit(EXIT_FAILURE); } if(collection) { for(size_t i = 0; i < collection->length; i++) myhtml_tree_print_node(tree, collection->list[i], stdout); printf("Total found: %zu\n", collection->length); } myhtml_collection_destroy(collection); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); return 0; }