/* make the local output path for the given url
 *
 * The "http://" or "https://" prefix and any spaces that follow it are skipped,
 * the remainder is formatted as "<spider->root>/<remainder>", a path ending
 * with a separator is completed with "index.html" and every '?' in the result
 * is replaced with '_'.
 *
 * @param spider    the spider, only spider->root is read
 * @param url       the source url
 * @param data      the output buffer
 * @param maxn      the size of the output buffer
 *
 * @return          tb_true if the path has been written, tb_false on invalid
 *                  arguments or if the path does not fit into the buffer
 */
static tb_bool_t tb_demo_spider_make_ourl(tb_demo_spider_t* spider, tb_char_t const* url, tb_char_t* data, tb_size_t maxn)
{
    // check
    tb_assert_and_check_return_val(spider && url && data && maxn, tb_false);

    // skip protocol, the url itself is only read
    tb_char_t const* p = url;
    if (!tb_strnicmp(p, "http://", 7)) p += 7;
    else if (!tb_strnicmp(p, "https://", 8)) p += 8;

    // skip space
    while (*p && tb_isspace(*p)) p++;

    // format ourl
    tb_long_t n = tb_snprintf(data, maxn, "%s/%s", spider->root, p);
    tb_assert_and_check_return_val(n > 0 && n < maxn, tb_false);

    // no root? append '/' (it overwrites the terminator, which is restored below)
    if (!tb_strchr(p, '/') && !tb_strchr(p, '\\')) data[n++] = '/';
    tb_assert_and_check_return_val(n < maxn, tb_false);

    // '\\' => '/', so a trailing backslash is handled as a directory too
    if (data[n - 1] == '\\') data[n - 1] = '/';

    // directory? append index.html
    if (data[n - 1] == '/') n += tb_snprintf(data + n, maxn - n, "%s", "index.html");
    tb_assert_and_check_return_val(n > 0 && n < maxn, tb_false);

    // end
    data[n] = '\0';

    // replace '?' => '_'
    tb_char_t* q = data;
    while (*q)
    {
        // replace
        if (*q == '?') *q = '_';

        // next
        q++;
    }

    // trace
    tb_trace_d("make: %s => %s", url, data);

    // ok?
    return n > 0? tb_true : tb_false;
}
/* find the first occurrence of a character in the string, starting at an offset
 *
 * @param string    the string
 * @param p         the start offset, it must be less than the string size
 * @param c         the character to find
 *
 * @return          the offset of the found character relative to the string
 *                  start, or -1 if it was not found or the arguments are invalid
 */
tb_long_t tb_string_strchr(tb_string_ref_t string, tb_size_t p, tb_char_t c)
{
    // check, the offset 0 is valid: it searches from the string start
    tb_char_t const*    s = tb_string_cstr(string);
    tb_size_t           n = tb_string_size(string);
    tb_assert_and_check_return_val(s && p < n, -1);

    // done
    tb_char_t const* q = tb_strchr(s + p, c);
    return (q? q - s : -1);
}
tb_bool_t tb_environment_save(tb_environment_ref_t environment, tb_char_t const* name) { // check tb_assert_and_check_return_val(environment && name, tb_false); // empty? remove this environment variable if (!tb_vector_size(environment)) return !unsetenv(name); // init values string tb_string_t values; if (!tb_string_init(&values)) return tb_false; // make values string tb_for_all_if (tb_char_t const*, value, environment, value) { // the single value cannot exist ':' tb_assertf(!tb_strchr(value, ':'), "invalid value: %s", value); // append value tb_string_cstrcat(&values, value); tb_string_chrcat(&values, ':'); }
/* add values to the given environment variable
 *
 * The previous values of the variable are loaded first. If values contains
 * TM_ENVIRONMENT_SEP, it is split at each separator, empty items are skipped
 * and an item that already occurred in values is inserted only once.
 * Otherwise values is inserted as one single item. The result is saved back
 * to the variable.
 *
 * @param name      the variable name
 * @param values    the values, a single value or several separated values
 * @param to_head   passed through to tb_environment_insert() for each item
 *
 * @return          the result of tb_environment_save(), or tb_false if the
 *                  arguments are invalid or the environment cannot be created
 */
tb_bool_t tb_environment_add(tb_char_t const* name, tb_char_t const* values, tb_bool_t to_head)
{
    // check
    tb_assert_and_check_return_val(name && values, tb_false);

    // find the first separator position
    tb_bool_t           ok = tb_false;
    tb_char_t const*    p = tb_strchr(values, TM_ENVIRONMENT_SEP);
    if (p)
    {
        // init filter, it is optional: without it duplicated items are not skipped
        tb_hash_set_ref_t filter = tb_hash_set_init(8, tb_element_str(tb_true));

        // init environment
        tb_char_t               data[TB_PATH_MAXN];
        tb_environment_ref_t    environment = tb_environment_init();
        if (environment)
        {
            // load the previous values
            tb_environment_load(environment, name);

            // make environment, [b, p) is the current item and e is the end of values
            tb_char_t const* b = values;
            tb_char_t const* e = b + tb_strlen(values);
            do
            {
                // not empty?
                if (b < p)
                {
                    // the size, a too long item is truncated to the buffer
                    tb_size_t size = tb_min(p - b, sizeof(data) - 1);

                    // copy it
                    tb_strncpy(data, b, size);
                    data[size] = '\0';

                    // have been not inserted?
                    if (!filter || !tb_hash_set_get(filter, data))
                    {
                        // append the environment
                        tb_environment_insert(environment, data, to_head);

                        // save it to the filter, if there is one
                        if (filter) tb_hash_set_insert(filter, data);
                    }
                }

                // end?
                tb_check_break(p + 1 < e);

                // find the next separator position, the last item ends at e
                b = p + 1;
                p = tb_strchr(b, TM_ENVIRONMENT_SEP);
                if (!p) p = e;

            } while (1);

            // set environment variables
            ok = tb_environment_save(environment, name);

            // exit environment
            tb_environment_exit(environment);
        }

        // exit filter
        if (filter) tb_hash_set_exit(filter);
    }
    // only one?
    else
    {
        // set environment variables
        tb_environment_ref_t environment = tb_environment_init();
        if (environment)
        {
            // load the previous values
            tb_environment_load(environment, name);

            // append the environment
            tb_environment_insert(environment, values, to_head);

            // set environment variables
            ok = tb_environment_save(environment, name);

            // exit environment
            tb_environment_exit(environment);
        }
    }

    // ok?
    return ok;
}
/* init the spider from the command line arguments
 *
 * With TB_CONFIG_MODULE_HAVE_OBJECT the settings are read from the options
 * "home", "only", "directory", "agent", "timeout" and "rate"; otherwise
 * argv[1] is the home url and argv[2] (if given) the root directory.
 * A missing root falls back to "<temporary directory>/spider", a missing
 * user agent, timeout or rate falls back to the TB_DEMO_SPIDER_* defaults.
 *
 * @param spider    the spider to initialize
 * @param argc      the argument count
 * @param argv      the arguments
 *
 * @return          tb_true if every step succeeded, otherwise tb_false
 *                  (with options enabled the option help is printed then)
 */
static tb_bool_t tb_demo_spider_init(tb_demo_spider_t* spider, tb_int_t argc, tb_char_t** argv)
{
    // check
    tb_assert_and_check_return_val(spider && argc && argv, tb_false);

    // done
    tb_bool_t ok = tb_false;
    do
    {
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
        // init option
        spider->option = tb_option_init("spider", "the spider demo", g_options);
        tb_assert_and_check_break(spider->option);

        // done option
        if (!tb_option_done(spider->option, argc - 1, &argv[1])) break;

        // check
        tb_assert_and_check_break(tb_option_find(spider->option, "home"));

        // init home
        if (!tb_url_init(&spider->home)) break;
        tb_url_set(&spider->home, tb_option_item_cstr(spider->option, "home"));
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init only home?
        if (tb_option_find(spider->option, "only"))
            spider->home_only = tb_option_item_bool(spider->option, "only");

        // init root
        tb_char_t const* root = tb_option_item_cstr(spider->option, "directory");

        // init user agent
        spider->user_agent = tb_option_item_cstr(spider->option, "agent");

        // init timeout
        if (tb_option_find(spider->option, "timeout"))
            spider->timeout = tb_option_item_sint32(spider->option, "timeout");

        // init limited rate
        if (tb_option_find(spider->option, "rate"))
            spider->limited_rate = tb_option_item_uint32(spider->option, "rate");
#else
        // check
        tb_assert_and_check_break(argv[1]);

        // init home, the url object is filled in the same way as in the option branch
        if (!tb_url_init(&spider->home)) break;
        tb_url_set(&spider->home, argv[1]);
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init root, it is optional
        tb_char_t const* root = argc > 2? argv[2] : tb_null;
#endif

        // the home host
        tb_char_t const* host = tb_url_host_get(&spider->home);
        tb_assert_and_check_break(host);

        // init home domain: the host from its first '.'
        tb_char_t const* domain = tb_strchr(host, '.');
        if (domain)
        {
            tb_strlcpy(spider->home_domain, domain, sizeof(spider->home_domain) - 1);
            spider->home_domain[sizeof(spider->home_domain) - 1] = '\0';
        }

        // using the default root
        if (root) tb_strlcpy(spider->root, root, sizeof(spider->root) - 1);
        else
        {
            // the temporary root
            tb_directory_temp(spider->root, sizeof(spider->root) - 1);

            // append spider, but never write past the end of the root buffer
            if (tb_strlen(spider->root) + sizeof("/spider") > sizeof(spider->root)) break;
            tb_strcat(spider->root, "/spider");
        }
        tb_trace_d("root: %s", spider->root);

        // using the default user agent
        if (!spider->user_agent) spider->user_agent = TB_DEMO_SPIDER_USER_AGENT;

        // using the default timeout
        if (!spider->timeout) spider->timeout = TB_DEMO_SPIDER_TASK_TIMEOUT;

        // using the default rate
        if (!spider->limited_rate) spider->limited_rate = TB_DEMO_SPIDER_TASK_RATE;

        // strip root tail: '/' or '\\'
        tb_size_t size = tb_strlen(spider->root);
        if (size && (spider->root[size - 1] == '/' || spider->root[size - 1] == '\\')) spider->root[size - 1] = '\0';

        // init state
        spider->state = TB_STATE_OK;

        // init lock
        if (!tb_spinlock_init(&spider->lock)) break;

        // init pool
        spider->pool = tb_fixed_pool_init(tb_null, TB_DEMO_SPIDER_TASK_MAXN >> 2, sizeof(tb_demo_spider_task_t), tb_null, tb_null, tb_null);
        tb_assert_and_check_break(spider->pool);

        // init filter
        spider->filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_001, 3, TB_DEMO_SPIDER_FILTER_MAXN, tb_item_func_str(tb_true));
        tb_assert_and_check_break(spider->filter);

        // register lock profiler
#ifdef TB_LOCK_PROFILER_ENABLE
        tb_lock_profiler_register(tb_lock_profiler(), (tb_pointer_t)&spider->lock, "spider");
#endif

        // ok
        ok = tb_true;

    } while (0);

    // failed? help it
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
    if (!ok && spider->option) tb_option_help(spider->option);
#endif

    // ok?
    return ok;
}