/* //////////////////////////////////////////////////////////////////////////////////////
 * implementation
 */

/* create a sql database from the given url
 *
 * the url is parsed and must use the sql or the file protocol. every backend
 * that was compiled in is asked to probe the url and the backend which reports
 * the highest score creates the database; a score of 100 stops the probing at once.
 *
 * @param url   the database url, must not be null
 *
 * @return      the database, or tb_null if the url cannot be parsed, the protocol
 *              is not accepted, no backend reports a score above zero or the
 *              selected backend fails to create the database
 */
tb_database_sql_ref_t tb_database_sql_init(tb_char_t const* url)
{
    // the url is required
    tb_assert_and_check_return_val(url, tb_null);

    // the probers of the compiled backends, slot 0 is only a placeholder
    static tb_size_t (*s_probers[])(tb_url_t const*) =
    {
        tb_null
#ifdef TB_CONFIG_THIRD_HAVE_MYSQL
    ,   tb_database_mysql_probe
#endif
#ifdef TB_CONFIG_THIRD_HAVE_SQLITE3
    ,   tb_database_sqlite3_probe
#endif
    };

    // the creators, listed in the same order as the probers
    static tb_database_sql_ref_t (*s_creators[])(tb_url_t const*) =
    {
        tb_null
#ifdef TB_CONFIG_THIRD_HAVE_MYSQL
    ,   tb_database_mysql_init
#endif
#ifdef TB_CONFIG_THIRD_HAVE_SQLITE3
    ,   tb_database_sqlite3_init
#endif
    };

    tb_url_t                target;
    tb_database_sql_ref_t   database = tb_null;
    tb_bool_t               succeeded = tb_false;
    do
    {
        tb_trace_d("init: %s: ..", url);

        // parse the url
        if (!tb_url_init(&target)) break;
        if (!tb_url_set(&target, url)) break;

        // only the sql and file protocols are accepted
        tb_size_t protocol = tb_url_protocol_get(&target);
        tb_assert_and_check_break(protocol == TB_URL_PROTOCOL_SQL || protocol == TB_URL_PROTOCOL_FILE);

        // find the backend with the highest probe score
        tb_size_t count      = tb_arrayn(s_probers);
        tb_size_t best_index = 0;
        tb_size_t best_score = 0;
        tb_size_t index;
        for (index = 1; index < count; index++)
        {
            // skip the empty slots
            if (!s_probers[index]) continue;

            // only a strictly better score replaces the current candidate
            tb_size_t score = s_probers[index](&target);
            if (score <= best_score) continue;
            best_score = score;
            best_index = index;

            // a full score cannot be beaten
            if (score == 100) break;
        }

        // best_index stays at the empty slot 0 if no backend has matched
        tb_check_break(best_index < count && s_creators[best_index]);

        // create the database with the selected backend
        database = s_creators[best_index](&target);
        tb_assert_and_check_break(database);

        tb_trace_d("init: %s: ok", url);
        succeeded = tb_true;

    } while (0);

    // the parsed url is not needed any more
    tb_url_exit(&target);

    // roll back on failure
    if (!succeeded)
    {
        tb_trace_d("init: %s: no", url);

        if (database) tb_database_sql_exit(database);
        database = tb_null;
    }

    return database;
}
/* the parser task of the thread pool
 *
 * opens the html file at task->ourl, reads the urls out of it and hands each
 * of them to tb_demo_spider_task_done(). an url which was refused with the
 * "full" flag set is kept in the cache of the parser as long as the cache
 * has room. finally the cached urls are handed over again, oldest first,
 * until the cache is empty or one of them is refused.
 *
 * @param worker    the thread pool worker, used to get the parser
 * @param priv      the task, a tb_demo_spider_task_t pointer
 */
static tb_void_t tb_demo_spider_parser_task_done(tb_thread_pool_worker_ref_t worker, tb_cpointer_t priv)
{
    // the task
    tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    tb_assert_and_check_return(worker && task && task->spider);

    // the parser of this worker
    tb_demo_spider_parser_t* parser = tb_demo_spider_parser_init(worker);
    tb_assert_and_check_return(parser && parser->stream && parser->reader && parser->cache);

    // read the urls from the html file
    if (tb_demo_spider_parser_open_html(parser->stream, task->ourl))
    {
        if (tb_xml_reader_open(parser->reader, parser->stream, tb_false))
        {
            tb_trace_d("parser: open: %s", task->ourl);

            // start from the url of the task
            tb_url_set(&parser->iurl, task->iurl);

            for (;;)
            {
                // the spider has been stopped?
                if (TB_STATE_OK != tb_atomic_get(&task->spider->state)) break;

                // no more urls?
                if (!tb_demo_spider_parser_get_url(parser->reader, &parser->iurl)) break;

                tb_trace_d("parser: done: %s", tb_url_get(&parser->iurl));

                // hand the url over, nothing more to do if it was accepted
                tb_bool_t full = tb_false;
                if (tb_demo_spider_task_done(task->spider, tb_url_get(&parser->iurl), &full)) continue;

                // refused for another reason than "full"? stop parsing
                tb_assert_and_check_break(full);

                // keep the url for later if the cache still has room
                if (!tb_circle_queue_full(parser->cache))
                    tb_circle_queue_put(parser->cache, tb_url_get(&parser->iurl));

                tb_trace_d("parser: cache: save: %s, size: %lu", tb_url_get(&parser->iurl), tb_circle_queue_size(parser->cache));
            }

            tb_xml_reader_clos(parser->reader);
        }

        tb_stream_clos(parser->stream);
    }

    // hand over the cached urls
    while (!tb_circle_queue_null(parser->cache))
    {
        // peek the head of the cache
        tb_char_t const* cached = (tb_char_t const*)tb_circle_queue_get(parser->cache);
        tb_assert_and_check_break(cached);

        // still refused? leave it and the remaining urls in the cache
        if (!tb_demo_spider_task_done(task->spider, cached, tb_null)) break;

        tb_trace_d("parser: cache: load: %s, size: %lu", cached, tb_circle_queue_size(parser->cache));

        // accepted, remove it from the cache
        tb_circle_queue_pop(parser->cache);
    }
}
/* init the spider from the command line
 *
 * with TB_CONFIG_MODULE_HAVE_OBJECT the arguments are parsed as options
 * (home, only, directory, agent, timeout, rate), otherwise argv[1] is the
 * home url and the optional argv[2] is the root directory.
 *
 * the user agent, timeout and limited rate fall back to the demo defaults
 * when they are still zero after the arguments were read.
 * NOTE(review): without the option module nothing assigns these fields, so
 * this assumes that the caller has zero-initialized *spider - confirm.
 *
 * @param spider    the spider to be initialized
 * @param argc      the argument count, must not be zero
 * @param argv      the arguments
 *
 * @return          tb_true on success, tb_false otherwise. nothing which was
 *                  initialized before the failure is released here.
 */
static tb_bool_t tb_demo_spider_init(tb_demo_spider_t* spider, tb_int_t argc, tb_char_t** argv)
{
    // check
    tb_assert_and_check_return_val(spider && argc && argv, tb_false);

    // done
    tb_bool_t ok = tb_false;
    do
    {
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
        // init option
        spider->option = tb_option_init("spider", "the spider demo", g_options);
        tb_assert_and_check_break(spider->option);

        // done option
        if (!tb_option_done(spider->option, argc - 1, &argv[1])) break;

        // check
        tb_assert_and_check_break(tb_option_find(spider->option, "home"));

        // init home
        if (!tb_url_init(&spider->home)) break;
        if (!tb_url_set(&spider->home, tb_option_item_cstr(spider->option, "home"))) break;
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init only home?
        if (tb_option_find(spider->option, "only"))
            spider->home_only = tb_option_item_bool(spider->option, "only");

        // init root
        tb_char_t const* root = tb_option_item_cstr(spider->option, "directory");

        // init user agent
        spider->user_agent = tb_option_item_cstr(spider->option, "agent");

        // init timeout
        if (tb_option_find(spider->option, "timeout"))
            spider->timeout = tb_option_item_sint32(spider->option, "timeout");

        // init limited rate
        if (tb_option_find(spider->option, "rate"))
            spider->limited_rate = tb_option_item_uint32(spider->option, "rate");
#else
        // check
        tb_assert_and_check_break(argc > 1 && argv[1]);

        // init home, it is an url object and not a plain string
        if (!tb_url_init(&spider->home)) break;
        if (!tb_url_set(&spider->home, argv[1])) break;
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init root, it is optional
        tb_char_t const* root = argc > 2? argv[2] : tb_null;
#endif

        // the home host
        tb_char_t const* host = tb_url_host_get(&spider->home);
        tb_assert_and_check_break(host);

        // init home domain: the host from its first '.' on
        tb_char_t const* domain = tb_strchr(host, '.');
        if (domain)
        {
            tb_strlcpy(spider->home_domain, domain, sizeof(spider->home_domain) - 1);
            spider->home_domain[sizeof(spider->home_domain) - 1] = '\0';
        }

        // using the given root
        if (root) tb_strlcpy(spider->root, root, sizeof(spider->root) - 1);
        // using the default root
        else
        {
            // the temporary root
            if (!tb_directory_temp(spider->root, sizeof(spider->root) - 1)) break;

            // append spider, but only if it fits into the root buffer
            tb_size_t used = tb_strlen(spider->root);
            tb_check_break(used + sizeof("/spider") <= sizeof(spider->root));
            tb_strcat(spider->root, "/spider");
        }
        tb_trace_d("root: %s", spider->root);

        // using the default user agent
        if (!spider->user_agent) spider->user_agent = TB_DEMO_SPIDER_USER_AGENT;

        // using the default timeout
        if (!spider->timeout) spider->timeout = TB_DEMO_SPIDER_TASK_TIMEOUT;

        // using the default rate
        if (!spider->limited_rate) spider->limited_rate = TB_DEMO_SPIDER_TASK_RATE;

        // strip root tail: '/' or '\\'
        tb_size_t size = tb_strlen(spider->root);
        if (size && (spider->root[size - 1] == '/' || spider->root[size - 1] == '\\'))
            spider->root[size - 1] = '\0';

        // init state
        spider->state = TB_STATE_OK;

        // init lock
        if (!tb_spinlock_init(&spider->lock)) break;

        // init pool
        spider->pool = tb_fixed_pool_init(tb_null, TB_DEMO_SPIDER_TASK_MAXN >> 2, sizeof(tb_demo_spider_task_t), tb_null, tb_null, tb_null);
        tb_assert_and_check_break(spider->pool);

        // init filter
        spider->filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_001, 3, TB_DEMO_SPIDER_FILTER_MAXN, tb_item_func_str(tb_true));
        tb_assert_and_check_break(spider->filter);

        // register lock profiler
#ifdef TB_LOCK_PROFILER_ENABLE
        tb_lock_profiler_register(tb_lock_profiler(), (tb_pointer_t)&spider->lock, "spider");
#endif

        // ok
        ok = tb_true;

    } while (0);

    // failed? help it
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
    if (!ok && spider->option) tb_option_help(spider->option);
#endif

    // ok?
    return ok;
}
/* read the next url from the reader
 *
 * walks the reader events until an <a>, <link>, <img>, <frame> or <source>
 * element with a non-empty href or src attribute is found whose value is
 * probed as a http or file url. a http value replaces the whole url, a file
 * value only replaces the path of the url. only the first usable attribute
 * of an element is taken.
 *
 * NOTE(review): <script src=""> is not followed - confirm that this is intended.
 *
 * @param reader    the xml reader, it is left behind the element which was used
 * @param url       the url to be updated
 *
 * @return          non-zero if an url was stored, 0 if the reader has no more events
 */
static tb_size_t tb_demo_spider_parser_get_url(tb_xml_reader_ref_t reader, tb_url_ref_t url)
{
    // check
    tb_assert_and_check_return_val(reader && url, 0);

    // done
    tb_size_t ok = 0;
    tb_size_t event = TB_XML_READER_EVENT_NONE;
    while (!ok && (event = tb_xml_reader_next(reader)))
    {
        switch (event)
        {
        case TB_XML_READER_EVENT_ELEMENT_EMPTY:
        case TB_XML_READER_EVENT_ELEMENT_BEG:
            {
                // the element name, an element without name is skipped
                tb_char_t const* name = tb_xml_reader_element(reader);
                tb_check_break(name);

                // <a href="" />, <link href="" />, <img src="" />, <frame src="" /> or <source src="" />?
                if (    !tb_stricmp(name, "a")
                    ||  !tb_stricmp(name, "link")
                    ||  !tb_stricmp(name, "img")
                    ||  !tb_stricmp(name, "frame")
                    ||  !tb_stricmp(name, "source"))
                {
                    // walk attributes, stop at the first one which yields an url so that
                    // a later attribute can neither overwrite it nor reset the result
                    tb_xml_node_ref_t attr = (tb_xml_node_ref_t)tb_xml_reader_attributes(reader);
                    for (; attr && !ok; attr = attr->next)
                    {
                        // href or src?
                        if (    tb_string_size(&attr->data)
                            &&  (   !tb_string_cstricmp(&attr->name, "href")
                                ||  !tb_string_cstricmp(&attr->name, "src")))
                        {
                            // the url protocol
                            tb_size_t protocol = tb_url_protocol_probe(tb_string_cstr(&attr->data));

                            // http? save the whole url, a failure keeps on searching
                            if (protocol == TB_URL_PROTOCOL_HTTP)
                                ok = tb_url_set(url, tb_string_cstr(&attr->data));
                            // file? save the path only, the rest of the url is kept
                            else if (protocol == TB_URL_PROTOCOL_FILE)
                            {
                                tb_url_path_set(url, tb_string_cstr(&attr->data));
                                ok = tb_true;
                            }
                        }
                    }
                }
            }
            break;
        default:
            break;
        }
    }

    // ok?
    return ok;
}