const std::vector<CMyHtmlParser::HtmlTag>& CMyHtmlParser::parse(const QByteArray& html)
{
	_tags.clear();

	_tree = myhtml_tree_create();
	assert_r(_tree);
	assert_r(myhtml_tree_init(_tree, _myhtmlInstance) == MyCORE_STATUS_OK);
	myhtml_callback_tree_node_insert_set(_tree, &CMyHtmlParser::callbackNodeInserted, this);

	/*
	* From Specification:
	*
	* The authoring conformance requirements for character encoding declarations limit them to only
	* appearing in the first 1024 bytes. User agents are therefore encouraged to use the prescan
	* algorithm below (as invoked by these steps) on the first 1024 bytes, but not to stall beyond that.
	*/
	_encoding = myencoding_prescan_stream_to_determine_encoding(html.data(), std::min(html.size(), 1024));
	if (_encoding == MyENCODING_NOT_DETERMINED)
	{
		assert_unconditional_r("Failed to determine data encoding");
		_encoding = MyENCODING_UTF_8;
	}

	assert_r(myhtml_parse(_tree, _encoding, html.data(), html.size()) == MyCORE_STATUS_OK);
	myhtml_tree_destroy(_tree);

	return _tags;
}
Exemple #2
0
void test_all(void)
{
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    DIR *dir;
    struct dirent *ent;
    struct stat path_stat;
    
    const char *from_dir = "/new/Test/out/";
    size_t from_dir_len = strlen(from_dir);
    
    char path[4096];
    strncpy(path, from_dir, from_dir_len);
    
    size_t count = 0;
    
    if((dir = opendir(from_dir)) != NULL)
    {
        while((ent = readdir(dir)) != NULL)
        {
            sprintf(&path[from_dir_len], "%s", ent->d_name);
            
            stat(path, &path_stat);
            
            if(ent->d_name[0] != '.' && !S_ISDIR(path_stat.st_mode))
            {
                count++;
//                printf("%zu: %s\n", count, path);
                
                struct res_html res = load_html(path);
                myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size);
                
//                myhtml_tree_node_t **node_list = myhtml_get_elements_by_tag_id(tree, MyHTML_TAG_TITLE, NULL);
//                
//                if(node_list && node_list[0])
//                    if(node_list[0]->token)
//                        myhtml_tree_print_by_tree_idx(tree, node_list[0]->child, stdout, 0);
//                
//                myhtml_destroy_node_list(node_list);
                
                free(res.html);
            }
        }
        
        closedir (dir);
    }
    
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
}
int main(int argc, const char * argv[])
{
    const char* path;

    if (argc == 2) {
        path = argv[1];
    }
    else {
        printf("Bad ARGV!\nUse: get_title_high_level <path_to_html_file>\n");
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size);
    
    // parse html
    myhtml_collection_t *collection = myhtml_get_nodes_by_tag_id(tree, NULL, MyHTML_TAG_TITLE, NULL);
    
    if(collection && collection->list && collection->length) {
        myhtml_tree_node_t *text_node = myhtml_node_child(collection->list[0]);
        
        if(text_node) {
            const char* text = myhtml_node_text(text_node, NULL);
            
            if(text)
                printf("Title: %s\n", text);
        }
    }
    
    // release resources
    myhtml_collection_destroy(collection);
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    free(res.html);
    
    return 0;
}
Exemple #4
0
void chunk_test(void)
{
    struct res_html res = load_html("/new/C-git/myhtml/test/html/chunk.data");
    //struct res_html res = load_html("/new/C-git/myhtml/test/broken.html");
    
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    myhtml_encoding_set(tree, MyHTML_ENCODING_UTF_8);
    
    size_t begin = 0, i = 0;
    while (i < res.size)
    {
        if(res.html[i] == '\n') {
            //printf("Parse chunk: %.*s\n", (int)(i - begin), &res.html[begin]);
            myhtml_parse_chunk(tree, &res.html[begin], (i - begin));
            myhtml_tokenizer_wait(tree);
            
            begin = i + 1;
        }
        
        i++;
    }
    
    myhtml_parse_chunk(tree, &res.html[begin], (i - begin));
    myhtml_parse_chunk_end(tree);
    
    i = 0;
    while (i < res.size)
    {
        if(res.html[i] != '\n') {
            printf("%c", res.html[i]);
        }
        
        i++;
    }
    printf("\n");
    
    myhtml_tree_print_node_childs(tree, tree->document, stdout, 0);
    
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    free(res.html);
}
int main(int argc, const char * argv[])
{
    const char* path;

    if (argc == 2) {
        path = argv[1];
    }
    else {
        printf("Bad ARGV!\nUse: get_title_low_level <path_to_html_file>\n");
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size);
    
    // get title from index
    myhtml_collection_t *titles_list = myhtml_get_nodes_by_tag_id(tree, NULL, MyHTML_TAG_TITLE, NULL);
    
    if(titles_list && titles_list->length != 0 && titles_list->list[0]->child) {
        mycore_string_raw_t str = {0};
        myhtml_serialization_node(titles_list->list[0]->child, &str);
        
        printf("%s\n", str.data);
        
        mycore_string_raw_destroy(&str, false);
    }
    
    // release resources
    myhtml_collection_destroy(titles_list);
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    free(res.html);
    
    return 0;
}
int main(int argc, const char * argv[])
{
    const char* path;

    if (argc == 2) {
        path = argv[1];
    }
    else {
        printf("Bad ARGV!\nUse: serialization_high_level <path_to_html_file>\n");
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size);
    
    mycore_string_raw_t str_raw;
    mycore_string_raw_clean_all(&str_raw);
    
    if(myhtml_serialization_tree_buffer(myhtml_tree_get_document(tree), &str_raw)) {
        fprintf(stderr, "Could not serialization for the tree\n");
        exit(EXIT_FAILURE);
    }
    
    printf("%s", str_raw.data);
    mycore_string_raw_destroy(&str_raw, false);
    
    // release resources
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    free(res.html);
    
    return 0;
}
int main(int argc, const char * argv[])
{
    const char* path;
    const char* attr_key;
    
    if (argc == 3) {
        attr_key = argv[1];
        path = argv[2];
    }
    else {
        printf("Bad ARGV!\nUse: nodes_by_attr_key_high_level <attribute key> <path to html file>\n");
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size);
    
    // get and print
    myhtml_collection_t *collection = myhtml_get_nodes_by_attribute_key(tree, NULL, NULL, attr_key, strlen(attr_key), NULL);
    
    for(size_t i = 0; i < collection->length; i++)
        myhtml_tree_print_node(tree, collection->list[i], stdout);
    
    printf("Total found: %zu\n", collection->length);
    
    myhtml_collection_destroy(collection);
    
    // release resources
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    return 0;
}
int main(int argc, const char * argv[])
{
    const char* path;
    
    if (argc == 2) {
        path = argv[1];
    }
    else {
        printf("Bad ARGV!\nUse: callback_tree_node_high_level <path_to_html_file>\n");
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // set callbacks
    myhtml_callback_tree_node_insert_set(tree, callback_node_insert, NULL);
    myhtml_callback_tree_node_remove_set(tree, callback_node_remove, NULL);
    
    // parse html
    myhtml_parse(tree, MyENCODING_UTF_8, res.html, res.size);
    
    // release resources
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    free(res.html);
    
    return 0;
}
int main(int argc, const char * argv[])
{
    char html[] = "<div><span>Best of Fragments</span><a>click to make happy</a></div>";
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse_fragment(tree, MyHTML_ENCODING_UTF_8, html, strlen(html), MyHTML_TAG_DIV, MyHTML_NAMESPACE_HTML);
    
    // print fragment
    myhtml_tree_print_node_children(tree, myhtml_tree_get_document(tree), stdout, 0);
    
    // release resources
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    return 0;
}
Exemple #10
0
int main(int argc, const char * argv[])
{
//    chunk_test();
//    return 0;
//
    //myhtml_encoding_decode_single_byte(0xf4);
    
//    myhtml_encoding_result_t res_data = {0, 0, 0, 0, 0, 0};
//    
//    unsigned char data[20];
//    data[0] = 0xAC;
//    data[1] = 0xB1;
//    data[2] = 0x42;
//    data[3] = 0x4F;
//    data[4] = 0x51;
//    data[5] = 0x1B;
//    data[6] = 0x28;
//    data[7] = 0x42;
//    data[8] = 0;
//    
//    //myhtml_encoding_dec_to_char(54936, data);
//    
//    enum myhtml_encoding_status status = MyHTML_ENCODING_STATUS_CONTINUE;
//    
//    size_t i = 0;
//    while (status & MyHTML_ENCODING_STATUS_CONTINUE) {
//        status = myhtml_encoding_decode_euc_kr(data[i], &res_data);
//        i++;
//    }
//    
//    myhtml_string_convert_dec_to_ansi_utf8(res_data.result, (char *)data);
//    
//    return 0;
//    chunk_test();
//    uint64_t all_start1 = myhtml_hperf_clock(NULL);
//    test_all();
//    uint64_t all_stop1 = myhtml_hperf_clock(NULL);
//
//    myhtml_hperf_print("Parse html", all_start1, all_stop1, stdout);
//    return 0;
    
    /* Default path or argument value */
    //const char* path = "/new/C-git/myhtml/test/test.html";
    //const char* path = "/new/C-git/broken.html";
    const char* path = "/new/C-git/test_full.html";
    
    if (argc == 2) {
        path = argv[1];
    }
    
    setbuf(stdout, 0);
    
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    struct res_html res  = load_html(path);
    
    uint64_t all_start = myhtml_hperf_clock(NULL);
    uint64_t tree_init_start = myhtml_hperf_clock(NULL);
    
    // init once for N html
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    uint64_t tree_init_stop = myhtml_hperf_clock(NULL);
    uint64_t parse_start = myhtml_hperf_clock(NULL);
    
    myhtml_encoding_t encoding;
    myhtml_encoding_detect(res.html, res.size, &encoding);
    
    for(size_t i = 0; i < 1; i++)
    {
        //myhtml_parse(tree, text, strlen(text));
        myhtml_parse_single(tree, encoding, res.html, res.size);
//        myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size);
        
//        myhtml_tree_print_node_childs(tree, tree->document, stdout, 0);
    }
    //usleep(100000000);
    
    uint64_t parse_stop = myhtml_hperf_clock(NULL);
    uint64_t all_stop = myhtml_hperf_clock(NULL);
    
    printf("\n\nInformation:\n");
    printf("Timer (%llu ticks/sec):\n", (unsigned long long) myhtml_hperf_res(NULL));
    myhtml_hperf_print("\tFirst Tree init", tree_init_start, tree_init_stop, stdout);
    myhtml_hperf_print("\tParse html", parse_start, parse_stop, stdout);
    myhtml_hperf_print("\tTotal", all_start, all_stop, stdout);
    printf("\n");
    
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    free(res.html);
    
    return 0;
}
int main(int argc, const char * argv[])
{
    const char* path;
    const char* attr_value;
    
    struct res_argv rargv;
    
    if(argc > 2) {
        path = argv[1];
        attr_value = argv[2];
        
        rargv = get_argv(3, argc, argv);
    }
    else {
        print_usage();
        exit(EXIT_FAILURE);
    }
    
    struct res_html res = load_html_file(path);
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse(tree, MyHTML_ENCODING_UTF_8, res.html, res.size);
    
    // get and print
    myhtml_collection_t* collection = NULL;
    
    switch (rargv.search_type) {
        case 0:
            collection = myhtml_get_nodes_by_attribute_value(tree, NULL, NULL, rargv.is_insensitive,
                                                             rargv.key, rargv.key_length,
                                                             attr_value, strlen(attr_value), NULL);
            break;
        case 1:
            collection = myhtml_get_nodes_by_attribute_value_whitespace_separated(tree, NULL, NULL, rargv.is_insensitive,
                                                                                  rargv.key, rargv.key_length,
                                                                                  attr_value, strlen(attr_value), NULL);
            break;
        case 2:
            collection = myhtml_get_nodes_by_attribute_value_begin(tree, NULL, NULL, rargv.is_insensitive,
                                                                   rargv.key, rargv.key_length,
                                                                   attr_value, strlen(attr_value), NULL);
            break;
        case 3:
            collection = myhtml_get_nodes_by_attribute_value_end(tree, NULL, NULL, rargv.is_insensitive,
                                                                 rargv.key, rargv.key_length,
                                                                 attr_value, strlen(attr_value), NULL);
            break;
        case 4:
            collection = myhtml_get_nodes_by_attribute_value_contain(tree, NULL, NULL, rargv.is_insensitive,
                                                                     rargv.key, rargv.key_length,
                                                                     attr_value, strlen(attr_value), NULL);
            break;
        case 5:
            collection = myhtml_get_nodes_by_attribute_value_hyphen_separated(tree, NULL, NULL, rargv.is_insensitive,
                                                                              rargv.key, rargv.key_length,
                                                                              attr_value, strlen(attr_value), NULL);
            break;
            
        default:
            print_usage();
            exit(EXIT_FAILURE);
    }
    
    if(collection) {
        for(size_t i = 0; i < collection->length; i++)
            myhtml_tree_print_node(tree, collection->list[i], stdout);
        
        printf("Total found: %zu\n", collection->length);
    }
    
    myhtml_collection_destroy(collection);
    
    // release resources
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    return 0;
}