Exemple #1
0
/* //////////////////////////////////////////////////////////////////////////////////////
 * implementation
 */
/* init the sql database from the given url
 *
 * The url is parsed, every compiled-in backend probes it, and the backend
 * that reports the highest score is asked to create the database.
 *
 * @param url   the database url, must not be null
 *
 * @return      the database, or tb_null on failure
 */
tb_database_sql_ref_t tb_database_sql_init(tb_char_t const* url)
{
    // the url is required
    tb_assert_and_check_return_val(url, tb_null);

    // the probe funcs of the compiled-in backends, slot 0 is an empty placeholder
    static tb_size_t (*s_probe[])(tb_url_t const*) =
    {
        tb_null
#ifdef TB_CONFIG_THIRD_HAVE_MYSQL
    ,   tb_database_mysql_probe
#endif
#ifdef TB_CONFIG_THIRD_HAVE_SQLITE3
    ,   tb_database_sqlite3_probe
#endif
    };

    // the init funcs, in the same order as the probe funcs
    static tb_database_sql_ref_t (*s_init[])(tb_url_t const*) =
    {
        tb_null
#ifdef TB_CONFIG_THIRD_HAVE_MYSQL
    ,   tb_database_mysql_init
#endif
#ifdef TB_CONFIG_THIRD_HAVE_SQLITE3
    ,   tb_database_sqlite3_init
#endif
    };

    // the state
    tb_url_t                parsed_url;
    tb_database_sql_ref_t   database = tb_null;
    tb_bool_t               succeeded = tb_false;
    do
    {
        // trace
        tb_trace_d("init: %s: ..", url);

        // init the url object and fill it from the string
        if (!tb_url_init(&parsed_url) || !tb_url_set(&parsed_url, url)) break;

        // only the sql and file protocols are accepted
        tb_size_t protocol = tb_url_protocol_get(&parsed_url);
        tb_assert_and_check_break(protocol == TB_URL_PROTOCOL_SQL || protocol == TB_URL_PROTOCOL_FILE);

        // find the backend with the highest probe score
        tb_size_t count         = tb_arrayn(s_probe);
        tb_size_t best_score    = 0;
        tb_size_t best_index    = 0;
        tb_size_t index;
        for (index = 1; index < count; index++)
        {
            // skip the empty slots
            if (!s_probe[index]) continue;

            // not better than the current best? try the next one
            tb_size_t score = s_probe[index](&parsed_url);
            if (score <= best_score) continue;

            // remember the new best backend
            best_score = score;
            best_index = index;

            // a score of 100 ends the search at once
            if (score == 100) break;
        }

        // no backend accepted the url? (index 0 is the empty placeholder)
        tb_check_break(best_index < count && s_init[best_index]);

        // create the database with the selected backend
        database = s_init[best_index](&parsed_url);
        tb_assert_and_check_break(database);

        // trace
        tb_trace_d("init: %s: ok", url);

        // done
        succeeded = tb_true;

    } while (0);

    // the url object is not needed any more
    tb_url_exit(&parsed_url);

    // failed? release whatever was created
    if (!succeeded)
    {
        // trace
        tb_trace_d("init: %s: no", url);

        // exit database
        if (database) tb_database_sql_exit(database);
        database = tb_null;
    }

    // the database or tb_null
    return database;
}
Exemple #2
0
/* the parser task: read the html of one finished download and post a new
 * spider task for every url found in it
 *
 * Urls that cannot be posted while tb_demo_spider_task_done() reports "full"
 * are kept in the parser cache; the cache is drained at the end of every run.
 *
 * @param worker    the thread pool worker running this task
 * @param priv      the spider task (tb_demo_spider_task_t*)
 */
static tb_void_t tb_demo_spider_parser_task_done(tb_thread_pool_worker_ref_t worker, tb_cpointer_t priv)
{
    // the task is passed as the private data
    tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    tb_assert_and_check_return(worker && task && task->spider);

    // get the parser for this worker
    tb_demo_spider_parser_t* parser = tb_demo_spider_parser_init(worker);
    tb_assert_and_check_return(parser && parser->stream && parser->reader && parser->cache);

    // open the html stream of the output url
    if (tb_demo_spider_parser_open_html(parser->stream, task->ourl))
    {
        // attach the xml reader to the stream
        if (tb_xml_reader_open(parser->reader, parser->stream, tb_false))
        {
            // trace
            tb_trace_d("parser: open: %s", task->ourl);

            // start from the input url of the task
            tb_url_set(&parser->iurl, task->iurl);

            // walk all urls of the document
            for (;;)
            {
                // the spider is not running any more?
                if (TB_STATE_OK != tb_atomic_get(&task->spider->state)) break;

                // no more urls?
                if (!tb_demo_spider_parser_get_url(parser->reader, &parser->iurl)) break;

                // trace
                tb_trace_d("parser: done: %s", tb_url_get(&parser->iurl));

                // post a task for this url, posted? go on with the next url
                tb_bool_t full = tb_false;
                if (tb_demo_spider_task_done(task->spider, tb_url_get(&parser->iurl), &full)) continue;

                // only "full" is accepted as the reason of the failure
                tb_assert_and_check_break(full);

                // keep the url for later if the cache still has room
                if (!tb_circle_queue_full(parser->cache)) tb_circle_queue_put(parser->cache, tb_url_get(&parser->iurl));

                // trace
                tb_trace_d("parser: cache: save: %s, size: %lu", tb_url_get(&parser->iurl), tb_circle_queue_size(parser->cache));
            }

            // close the reader
            tb_xml_reader_clos(parser->reader);
        }

        // close the stream
        tb_stream_clos(parser->stream);
    }

    // post the cached urls until the cache is empty or posting fails
    for (;;)
    {
        // the cache is empty?
        if (tb_circle_queue_null(parser->cache)) break;

        // peek the head url
        tb_char_t const* url = (tb_char_t const*)tb_circle_queue_get(parser->cache);
        tb_assert_and_check_break(url);

        // still cannot be posted? leave it in the cache
        if (!tb_demo_spider_task_done(task->spider, url, tb_null)) break;

        // trace
        tb_trace_d("parser: cache: load: %s, size: %lu", url, tb_circle_queue_size(parser->cache));

        // posted, remove it from the cache
        tb_circle_queue_pop(parser->cache);
    }
}
Exemple #3
0
/* init the spider from the command line arguments
 *
 * With TB_CONFIG_MODULE_HAVE_OBJECT the arguments are parsed with the option
 * module (home, only, directory, agent, timeout, rate). Otherwise argv[1] is
 * the home url and argv[2], if given, is the root directory.
 *
 * Fields that were not given on the command line and are still zero/null
 * afterwards (user agent, timeout, limited rate) get the demo defaults; a
 * missing root falls back to "<temporary directory>/spider".
 *
 * @param spider    the spider to initialize
 * @param argc      the argument count
 * @param argv      the arguments
 *
 * @return          tb_true if ok, tb_false otherwise (the option help is
 *                  printed on failure when the option module is available)
 */
static tb_bool_t tb_demo_spider_init(tb_demo_spider_t* spider, tb_int_t argc, tb_char_t** argv)
{
    // check
    tb_assert_and_check_return_val(spider && argc && argv, tb_false);

    // done
    tb_bool_t ok = tb_false;
    do
    {
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
        // init option
        spider->option = tb_option_init("spider", "the spider demo", g_options);
        tb_assert_and_check_break(spider->option);
 
        // done option, skip the program name
        if (!tb_option_done(spider->option, argc - 1, &argv[1])) break;

        // the home url is required
        tb_assert_and_check_break(tb_option_find(spider->option, "home"));

        // init home, an unparsable url is a failure
        if (!tb_url_init(&spider->home)) break;
        if (!tb_url_set(&spider->home, tb_option_item_cstr(spider->option, "home"))) break;
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init only home?
        if (tb_option_find(spider->option, "only"))
            spider->home_only = tb_option_item_bool(spider->option, "only");

        // init root
        tb_char_t const* root = tb_option_item_cstr(spider->option, "directory");

        // init user agent
        spider->user_agent = tb_option_item_cstr(spider->option, "agent");

        // init timeout
        if (tb_option_find(spider->option, "timeout"))
            spider->timeout = tb_option_item_sint32(spider->option, "timeout");

        // init limited rate
        if (tb_option_find(spider->option, "rate"))
            spider->limited_rate = tb_option_item_uint32(spider->option, "rate");
#else

        // the home url is required
        tb_assert_and_check_break(argv[1]);

        // init home: the url object must be filled with tb_url_set(),
        // the string cannot be assigned to it directly
        if (!tb_url_init(&spider->home)) break;
        if (!tb_url_set(&spider->home, argv[1])) break;
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init root, it is optional
        tb_char_t const* root = argc > 2? argv[2] : tb_null;
#endif

        // the home host
        tb_char_t const* host = tb_url_host_get(&spider->home);
        tb_assert_and_check_break(host);

        // init home domain: the host from its first '.' on
        tb_char_t const* domain = tb_strchr(host, '.');
        if (domain)
        {
            tb_strlcpy(spider->home_domain, domain, sizeof(spider->home_domain) - 1);
            spider->home_domain[sizeof(spider->home_domain) - 1] = '\0';
        }

        // using the given root?
        if (root) tb_strlcpy(spider->root, root, sizeof(spider->root) - 1);
        else 
        {
            // the temporary root
            tb_directory_temp(spider->root, sizeof(spider->root) - 1);

            // append "/spider", bounded by the space left in the buffer
            tb_size_t used = tb_strlen(spider->root);
            tb_strlcpy(spider->root + used, "/spider", sizeof(spider->root) - used);
        }
        tb_trace_d("root: %s", spider->root);

        // using the default user agent
        if (!spider->user_agent) spider->user_agent = TB_DEMO_SPIDER_USER_AGENT;

        // using the default timeout
        if (!spider->timeout) spider->timeout = TB_DEMO_SPIDER_TASK_TIMEOUT;

        // using the default rate
        if (!spider->limited_rate) spider->limited_rate = TB_DEMO_SPIDER_TASK_RATE;

        // strip root tail: '/' or '\\'
        tb_size_t size = tb_strlen(spider->root);
        if (size && (spider->root[size - 1] == '/' || spider->root[size - 1] == '\\')) spider->root[size - 1] = '\0';

        // init state
        spider->state = TB_STATE_OK;

        // init lock
        if (!tb_spinlock_init(&spider->lock)) break;

        // init the task pool
        spider->pool = tb_fixed_pool_init(tb_null, TB_DEMO_SPIDER_TASK_MAXN >> 2, sizeof(tb_demo_spider_task_t), tb_null, tb_null, tb_null);
        tb_assert_and_check_break(spider->pool);

        // init the url filter
        spider->filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_001, 3, TB_DEMO_SPIDER_FILTER_MAXN, tb_item_func_str(tb_true));
        tb_assert_and_check_break(spider->filter);

        // register lock profiler
#ifdef TB_LOCK_PROFILER_ENABLE
        tb_lock_profiler_register(tb_lock_profiler(), (tb_pointer_t)&spider->lock, "spider");
#endif

        // ok
        ok = tb_true;

    } while (0);

    // failed? help it
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
    if (!ok && spider->option) tb_option_help(spider->option);
#endif

    // ok?
    return ok;
}
Exemple #4
0
/* read the reader forward until the next link and store it into the url
 *
 * Only the href/src attributes of <a>, <link>, <img>, <frame> and <source>
 * elements are considered. An attribute value probed as http replaces the
 * whole url, a value probed as file only replaces the path of the url.
 * The scan stops at the first attribute that was stored, so a later
 * attribute of the same element cannot overwrite or invalidate it.
 *
 * @param reader    the xml reader, already opened
 * @param url       the url to update
 *
 * @return          non-zero if a link was stored into the url, zero if
 *                  tb_xml_reader_next() returned zero before a link was found
 */
static tb_size_t tb_demo_spider_parser_get_url(tb_xml_reader_ref_t reader, tb_url_ref_t url)
{
    // check
    tb_assert_and_check_return_val(reader && url, tb_false);

    // done
    tb_size_t ok = 0;
    tb_size_t event = TB_XML_READER_EVENT_NONE;
    while (!ok && (event = tb_xml_reader_next(reader)))
    {
        switch (event)
        {
        case TB_XML_READER_EVENT_ELEMENT_EMPTY: 
        case TB_XML_READER_EVENT_ELEMENT_BEG: 
            {
                // the element name
                tb_char_t const* name = tb_xml_reader_element(reader);
                tb_check_break(name);

                // the elements that are scanned for links:
                // <a href="" />
                // <link href="" />
                // <img src="" />
                // <frame src="" />
                // <source src="" />
                // (<script src="" /> is not matched here)
                if (    !tb_stricmp(name, "a")
                    ||  !tb_stricmp(name, "link")
                    ||  !tb_stricmp(name, "img")
                    ||  !tb_stricmp(name, "frame")
                    ||  !tb_stricmp(name, "source"))
                {
                    // walk attributes until the first link has been stored
                    tb_xml_node_ref_t attr = (tb_xml_node_ref_t)tb_xml_reader_attributes(reader); 
                    for (; attr && !ok; attr = attr->next)
                    {
                        // a non-empty href or src?
                        if (    tb_string_size(&attr->data)
                            &&  (   !tb_string_cstricmp(&attr->name, "href")
                                ||  !tb_string_cstricmp(&attr->name, "src")))
                        {
                            // the url protocol
                            tb_size_t protocol = tb_url_protocol_probe(tb_string_cstr(&attr->data));

                            // http?
                            if (protocol == TB_URL_PROTOCOL_HTTP)
                            {
                                // replace the whole url, on failure the
                                // remaining attributes are still tried
                                ok = tb_url_set(url, tb_string_cstr(&attr->data));
                            }
                            // file?
                            else if (protocol == TB_URL_PROTOCOL_FILE)
                            {
                                // replace only the path
                                tb_url_path_set(url, tb_string_cstr(&attr->data));

                                // ok
                                ok = tb_true;
                            }
                        }
                    }
                }
            }
            break;
        default:
            break;
        }
    }

    // ok?
    return ok;
}