const std::vector<CMyHtmlParser::HtmlTag>& CMyHtmlParser::parse(const QByteArray& html) { _tags.clear(); _tree = myhtml_tree_create(); assert_r(_tree); assert_r(myhtml_tree_init(_tree, _myhtmlInstance) == MyCORE_STATUS_OK); myhtml_callback_tree_node_insert_set(_tree, &CMyHtmlParser::callbackNodeInserted, this); /* * From Specification: * * The authoring conformance requirements for character encoding declarations limit them to only * appearing in the first 1024 bytes. User agents are therefore encouraged to use the prescan * algorithm below (as invoked by these steps) on the first 1024 bytes, but not to stall beyond that. */ _encoding = myencoding_prescan_stream_to_determine_encoding(html.data(), std::min(html.size(), 1024)); if (_encoding == MyENCODING_NOT_DETERMINED) { assert_unconditional_r("Failed to determine data encoding"); _encoding = MyENCODING_UTF_8; } assert_r(myhtml_parse(_tree, _encoding, html.data(), html.size()) == MyCORE_STATUS_OK); myhtml_tree_destroy(_tree); return _tags; }
int main(int argc, const char * argv[]) { const char* path; if (argc == 2) { path = argv[1]; } else { printf("Bad ARGV!\nUse: callback_tree_node_high_level <path_to_html_file>\n"); exit(EXIT_FAILURE); } struct res_html res = load_html_file(path); // basic init myhtml_t* myhtml = myhtml_create(); myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // init tree myhtml_tree_t* tree = myhtml_tree_create(); myhtml_tree_init(tree, myhtml); // set callbacks myhtml_callback_tree_node_insert_set(tree, callback_node_insert, NULL); myhtml_callback_tree_node_remove_set(tree, callback_node_remove, NULL); // parse html myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size); // release resources myhtml_tree_destroy(tree); myhtml_destroy(myhtml); free(res.html); return 0; }