示例#1
0
文件: spider.c 项目: ahnan4arch/tbox
static tb_bool_t tb_demo_spider_make_ourl(tb_demo_spider_t* spider, tb_char_t const* url, tb_char_t* data, tb_size_t maxn)
{
    // check
    tb_assert_and_check_return_val(spider && url && data && maxn, tb_false);

    // skip protocol
	tb_char_t* p = (tb_char_t*)url;
	if (!tb_strnicmp(p, "http://", 7)) p += 7;
	else if (!tb_strnicmp(p, "https://", 8)) p += 8;

    // skip space
    while (*p && tb_isspace(*p)) p++;

	// format ourl
	tb_long_t n = tb_snprintf(data, maxn, "%s/%s", spider->root, p);
    tb_assert_and_check_return_val(n > 0 && n < maxn, tb_false);

    // no root? append '/'
    if (!tb_strchr(p, '/') && !tb_strchr(p, '\\')) data[n++] = '/';
    tb_assert_and_check_return_val(n < maxn, tb_false);

    // '\\' => '/'
    if (data[n - 1] == '/') data[n - 1] = '/';

    // directory? append index.html
    if (data[n - 1] == '/') n += tb_snprintf(data + n, maxn - n, "%s", "index.html");
    tb_assert_and_check_return_val(n > 0 && n < maxn, tb_false);

    // end
	data[n] = '\0';

    // replace '?' => '_'
    p = data;
    while (*p)
    {
        // replace
        if (*p == '?') *p = '_';

        // next
        p++;
    }

	// trace
	tb_trace_d("make: %s => %s", url, data);

	// ok?
	return n > 0? tb_true : tb_false;
}
示例#2
0
文件: string.c 项目: richwu/tbox
tb_long_t tb_string_strchr(tb_string_ref_t string, tb_size_t p, tb_char_t c)
{
    // check
    tb_char_t const*    s = tb_string_cstr(string);
    tb_size_t           n = tb_string_size(string);
    tb_assert_and_check_return_val(s && p && p < n, -1);

    // done
    tb_char_t* q = tb_strchr(s + p, c);
    return (q? q - s : -1);
}
示例#3
0
tb_bool_t tb_environment_save(tb_environment_ref_t environment, tb_char_t const* name)
{
    // check
    tb_assert_and_check_return_val(environment && name, tb_false);

    // empty? remove this environment variable
    if (!tb_vector_size(environment)) return !unsetenv(name);

    // init values string
    tb_string_t values;
    if (!tb_string_init(&values)) return tb_false;

    // make values string
    tb_for_all_if (tb_char_t const*, value, environment, value)
    {
        // the single value cannot exist ':'
        tb_assertf(!tb_strchr(value, ':'), "invalid value: %s", value);

        // append value
        tb_string_cstrcat(&values, value);
        tb_string_chrcat(&values, ':');
    }
示例#4
0
tb_bool_t tb_environment_add(tb_char_t const* name, tb_char_t const* values, tb_bool_t to_head)
{
    // check
    tb_assert_and_check_return_val(name && values, tb_false);

    // find the first separator position
    tb_bool_t ok = tb_false;
    tb_char_t const* p = values? tb_strchr(values, TM_ENVIRONMENT_SEP) : tb_null;
    if (p)
    {
        // init filter
        tb_hash_set_ref_t filter = tb_hash_set_init(8, tb_element_str(tb_true));

        // init environment 
        tb_char_t               data[TB_PATH_MAXN];
        tb_environment_ref_t    environment = tb_environment_init();
        if (environment)
        {
            // load the previous values
            tb_environment_load(environment, name);

            // make environment
            tb_char_t const* b = values;
            tb_char_t const* e = b + tb_strlen(values);
            do
            {
                // not empty?
                if (b < p)
                {
                    // the size
                    tb_size_t size = tb_min(p - b, sizeof(data) - 1);

                    // copy it
                    tb_strncpy(data, b, size);
                    data[size] = '\0';

                    // have been not inserted?
                    if (!filter || !tb_hash_set_get(filter, data)) 
                    {
                        // append the environment 
                        tb_environment_insert(environment, data, to_head);

                        // save it to the filter
                        tb_hash_set_insert(filter, data);
                    }
                }

                // end?
                tb_check_break(p + 1 < e);

                // find the next separator position
                b = p + 1;
                p = tb_strchr(b, TM_ENVIRONMENT_SEP);
                if (!p) p = e;

            } while (1);

            // set environment variables
            ok = tb_environment_save(environment, name);

            // exit environment
            tb_environment_exit(environment);
        }

        // exit filter
        if (filter) tb_hash_set_exit(filter);
        filter = tb_null;
    }
    // only one?
    else
    {
        // set environment variables
        tb_environment_ref_t environment = tb_environment_init();
        if (environment)
        {
            // load the previous values
            tb_environment_load(environment, name);

            // append the environment 
            tb_environment_insert(environment, values, to_head);

            // set environment variables
            ok = tb_environment_save(environment, name);

            // exit environment
            tb_environment_exit(environment);
        }
    }

    // ok?
    return ok;
}
示例#5
0
文件: spider.c 项目: ahnan4arch/tbox
static tb_bool_t tb_demo_spider_init(tb_demo_spider_t* spider, tb_int_t argc, tb_char_t** argv)
{
    // check
    tb_assert_and_check_return_val(spider && argc && argv, tb_false);

    // done
    tb_bool_t ok = tb_false;
    do
    {
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
        // init option
        spider->option = tb_option_init("spider", "the spider demo", g_options);
        tb_assert_and_check_break(spider->option);
 
        // done option
        if (!tb_option_done(spider->option, argc - 1, &argv[1])) break;

        // check
        tb_assert_and_check_break(tb_option_find(spider->option, "home"));

        // init home
        if (!tb_url_init(&spider->home)) break;
        tb_url_set(&spider->home, tb_option_item_cstr(spider->option, "home"));
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init only home?
        if (tb_option_find(spider->option, "only"))
            spider->home_only = tb_option_item_bool(spider->option, "only");

        // init root
        tb_char_t const* root = tb_option_item_cstr(spider->option, "directory");

        // init user agent
        spider->user_agent = tb_option_item_cstr(spider->option, "agent");

        // init timeout
        if (tb_option_find(spider->option, "timeout"))
            spider->timeout = tb_option_item_sint32(spider->option, "timeout");

        // init limited rate
        if (tb_option_find(spider->option, "rate"))
            spider->limited_rate = tb_option_item_uint32(spider->option, "rate");
#else

        // check
        tb_assert_and_check_break(argv[1]);

        // init home
        if (!tb_url_init(&spider->home)) break;
        spider->home = argv[1]? argv[1] : tb_null;
        tb_trace_d("home: %s", tb_url_get(&spider->home));

        // init root
        tb_char_t const* root = argv[2];
#endif

        // the home host
        tb_char_t const* host = tb_url_host_get(&spider->home);
        tb_assert_and_check_break(host);

        // init home domain
        tb_char_t const* domain = tb_strchr(host, '.');
        if (domain)
        {
            tb_strlcpy(spider->home_domain, domain, sizeof(spider->home_domain) - 1);
            spider->home_domain[sizeof(spider->home_domain) - 1] = '\0';
        }

        // using the default root
        if (root) tb_strlcpy(spider->root, root, sizeof(spider->root) - 1);
        else 
        {
            // the temporary root
            tb_directory_temp(spider->root, sizeof(spider->root) - 1);

            // append spider
            tb_strcat(spider->root, "/spider");
        }
        tb_trace_d("root: %s", spider->root);

        // using the default user agent
        if (!spider->user_agent) spider->user_agent = TB_DEMO_SPIDER_USER_AGENT;

        // using the default timeout
        if (!spider->timeout) spider->timeout = TB_DEMO_SPIDER_TASK_TIMEOUT;

        // using the default rate
        if (!spider->limited_rate) spider->limited_rate = TB_DEMO_SPIDER_TASK_RATE;

        // strip root tail: '/' or '\\'
        tb_size_t size = tb_strlen(spider->root);
        if (size && (spider->root[size - 1] == '/' || spider->root[size - 1] == '\\')) spider->root[size - 1] = '\0';

        // init state
        spider->state = TB_STATE_OK;

        // init lock
        if (!tb_spinlock_init(&spider->lock)) break;

        // init pool
        spider->pool = tb_fixed_pool_init(tb_null, TB_DEMO_SPIDER_TASK_MAXN >> 2, sizeof(tb_demo_spider_task_t), tb_null, tb_null, tb_null);
        tb_assert_and_check_break(spider->pool);

        // init filter
        spider->filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_001, 3, TB_DEMO_SPIDER_FILTER_MAXN, tb_item_func_str(tb_true));
        tb_assert_and_check_break(spider->filter);

        // register lock profiler
#ifdef TB_LOCK_PROFILER_ENABLE
        tb_lock_profiler_register(tb_lock_profiler(), (tb_pointer_t)&spider->lock, "spider");
#endif

        // ok
        ok = tb_true;

    } while (0);

    // failed? help it
#ifdef TB_CONFIG_MODULE_HAVE_OBJECT
    if (!ok && spider->option) tb_option_help(spider->option);
#endif

    // ok?
    return ok;
}