static void test_iri_compare(void) { static const struct iri_test_data { const char *url1, *url2; int result; } test_data[] = { { "http://abc.com", "http://abc.com/", -1}, // different, some web servers redirect ... to .../ due to normalization issues { "http://abc.com", "http://abc.com:", 0}, { "http://abc.com", "http://abc.com:/", -1}, { "http://abc.com", "http://abc.com:80/", -1}, { "http://abc.com", "http://abc.com:80//", -1}, // { "http://äöü.com", "http://ÄÖÜ.com:80//", 0}, { "http://abc.com:80/~smith/home.html", "http://abc.com/~smith/home.html", 0}, { "http://abc.com:80/~smith/home.html", "http://ABC.com/~smith/home.html", 0}, { "http://abc.com:80/~smith/home.html", "http://ABC.com/%7Esmith/home.html", 0}, { "http://abc.com:80/~smith/home.html", "http://ABC.com/%7esmith/home.html", 0}, { "http://ABC.com/%7esmith/home.html", "http://ABC.com/%7Esmith/home.html", 0}, { "http://ABC.com/%7esmith/home.html", "http://ACB.com/%7Esmith/home.html", -1} }; unsigned it; int n; for (it = 0; it < countof(test_data); it++) { const struct iri_test_data *t = &test_data[it]; mget_iri_t *iri1 = mget_iri_parse(t->url1, "utf-8"); mget_iri_t *iri2 = mget_iri_parse(t->url2, "utf-8"); n = mget_iri_compare(iri1, iri2); if (n < -1) n = -1; else if (n > 1) n = 1; if (n == t->result) ok++; else { failed++; info_printf("Failed [%u]: compare(%s,%s) -> %d (expected %d)\n", it, t->url1, t->url2, n, t->result); printf(" display %s / %s\n", iri1->display, iri2->display); printf(" scheme %s / %s\n", iri1->scheme, iri2->scheme); printf(" user %s / %s\n", iri1->userinfo, iri2->userinfo); printf(" host %s / %s\n", iri1->host, iri2->host); printf(" port %s / %s\n", iri1->port, iri2->port); printf(" path %s / %s\n", iri1->path, iri2->path); printf(" query %s / %s\n", iri1->query, iri2->query); printf(" fragment %s / %s\n", iri1->fragment, iri2->fragment); printf("\n"); } mget_iri_free(&iri2); mget_iri_free(&iri1); } }
/*
 * Release a job and remove it from the job queue.
 *
 * A job that carries a 'deferred' vector gets special treatment first:
 * the robot_job back-reference of its host (if any) is cleared, the job's own
 * IRI is freed, and one new job is queued for every entry of job->deferred.
 *
 * Afterwards job_free() is called on the job and the job is removed from the
 * list 'queue' while holding 'mutex'.
 *
 * A NULL job is ignored.
 *
 * NOTE(review): job_free() runs before the list removal and outside the lock;
 * presumably it only releases the job's members, not the job itself - confirm.
 */
void queue_del(JOB *job)
{
	if (!job)
		return;

	debug_printf("queue_del %p\n", (void *)job);

	// special handling for automatic robots.txt jobs
	if (job->deferred) {
		JOB new_job = { .iri = NULL };

		if (job->host)
			job->host->robot_job = NULL;

		mget_iri_free(&job->iri);

		// create a job for each deferred IRI
		for (int it = 0; it < mget_vector_size(job->deferred); it++) {
			new_job.iri = mget_vector_get(job->deferred, it);
			new_job.local_filename = get_local_filename(new_job.iri);
			queue_add_job(&new_job);
		}
	}

	job_free(job);

	mget_thread_mutex_lock(&mutex);
	mget_list_remove(&queue, job);
	mget_thread_mutex_unlock(&mutex);
}
/*
 * Try to register an IRI in the blacklist hash set.
 *
 * The blacklist is created on first use, keyed by hash_iri() and
 * mget_iri_compare(), with _free_entry as destructor. All accesses to it
 * happen while 'mutex' is held.
 *
 * Returns 'iri' if it was supported and not yet contained; the pointer itself
 * is stored in the set (put_noalloc), so the caller must not free it.
 * In every other case with a non-NULL input (unsupported IRI or already
 * blacklisted) the IRI is freed and NULL is returned.
 * A NULL input returns NULL.
 */
mget_iri_t *blacklist_add(mget_iri_t *iri)
{
	int inserted = 0;

	if (!iri)
		return NULL;

	if (mget_iri_supported(iri)) {
		mget_thread_mutex_lock(&mutex);

		if (!blacklist) {
			blacklist = mget_hashmap_create(128, -2, (unsigned int(*)(const void *))hash_iri, (int(*)(const void *, const void *))mget_iri_compare);
			mget_hashmap_set_destructor(blacklist, (void(*)(void *, void *))_free_entry);
		}

		if (!mget_hashmap_contains(blacklist, iri)) {
			// info_printf("Add to blacklist: %s\n",iri->uri);
			// the hashmap serves as a hashset: key only, no value
			mget_hashmap_put_noalloc(blacklist, iri, NULL);
			inserted = 1;
		}

		mget_thread_mutex_unlock(&mutex);
	}

	if (inserted)
		return iri;

	mget_iri_free(&iri);
	return NULL;
}
static void test_cookies(void) { static const struct test_data { const char *uri, *set_cookie, *name, *value, *domain, *path, *expires; unsigned int domain_dot : 1, // for compatibility with Netscape cookie format normalized : 1, persistent : 1, host_only : 1, secure_only : 1, // cookie should be used over secure connections only (TLS/HTTPS) http_only : 1; // just use the cookie via HTTP/HTTPS protocol int result, psl_result; } test_data[] = { { // allowed cookie "www.example.com", "ID=65=abcd; expires=Tuesday, 07-May-2013 07:48:53 GMT; path=/; domain=.example.com; HttpOnly", "ID", "65=abcd", "example.com", "/", "Tue, 07 May 2013 07:48:53 GMT", 1, 1, 1, 0, 0, 1, 0, 0 }, { // allowed cookie ANSI C's asctime format "www.example.com", "ID=65=abcd; expires=Tue May 07 07:48:53 2013; path=/; domain=.example.com", "ID", "65=abcd", "example.com", "/", "Tue, 07 May 2013 07:48:53 GMT", 1, 1, 1, 0, 0, 0, 0, 0 }, { // allowed cookie without path "www.example.com", "ID=65=abcd; expires=Tue, 07-May-2013 07:48:53 GMT; domain=.example.com", "ID", "65=abcd", "example.com", "/", "Tue, 07 May 2013 07:48:53 GMT", 1, 1, 1, 0, 0, 0, 0, 0 }, { // allowed cookie without domain "www.example.com", "ID=65=abcd; expires=Tue, 07-May-2013 07:48:53 GMT; path=/", "ID", "65=abcd", "www.example.com", "/", "Tue, 07 May 2013 07:48:53 GMT", 0, 1, 1, 1, 0, 0, 0, 0 }, { // allowed cookie without domain, path and expires "www.example.com", "ID=65=abcd", "ID", "65=abcd", "www.example.com", "/", "Tue, 07 May 2013 07:48:53 GMT", 0, 1, 0, 1, 0, 0, 0, 0 }, { // illegal cookie "www.example.com", "ID=65=abcd; expires=Tue, 07-May-2013 07:48:53 GMT; path=/; domain=.example.org", "ID", "65=abcd", "example.org", "/", "Tue, 07 May 2013 07:48:53 GMT", 1, 0, 1, 0, 0, 0, -1, 0 }, #ifdef WITH_LIBPSL { // supercookie, accepted by normalization (rule 'com') but not by mget_cookie_check_psl()) "www.example.com", "ID=65=abcd; expires=Mon, 29-Feb-2016 07:48:54 GMT; path=/; domain=.com; HttpOnly; Secure", "ID", "65=abcd", 
"com", "/", "Mon, 29 Feb 2016 07:48:54 GMT", 1, 0, 1, 0, 1, 1, 0, -1 }, { // supercookie, accepted by normalization (rule '*.ar') but not by mget_cookie_check_psl()) "www.sa.gov.au", "ID=65=abcd; expires=Tue, 29-Feb-2000 07:48:55 GMT; path=/; domain=.sa.gov.au", "ID", "65=abcd", "sa.gov.au", "/", "Tue, 29 Feb 2000 07:48:55 GMT", 1, 0, 1, 0, 0, 0, 0, -1 }, #endif { // exception rule '!educ.ar', accepted by normalization "www.educ.ar", "ID=65=abcd; path=/; domain=.educ.ar", "ID", "65=abcd", "educ.ar", "/", NULL, 1, 1, 0, 0, 0, 0, 0, 0 }, }; mget_cookie_t cookie; mget_cookie_db_t *cookies; mget_iri_t *iri; unsigned it; int result, result_psl; cookies = mget_cookie_db_init(NULL); mget_cookie_db_load_psl(cookies, DATADIR "/effective_tld_names.dat"); for (it = 0; it < countof(test_data); it++) { const struct test_data *t = &test_data[it]; char thedate[32], *header; iri = mget_iri_parse(t->uri, "utf-8"); mget_http_parse_setcookie(t->set_cookie, &cookie); if ((result = mget_cookie_normalize(iri, &cookie)) != t->result) { failed++; info_printf("Failed [%u]: normalize_cookie(%s) -> %d (expected %d)\n", it, t->set_cookie, result, t->result); mget_cookie_deinit(&cookie); goto next; } else { if ((result_psl = mget_cookie_check_psl(cookies, &cookie)) != t->psl_result) { failed++; info_printf("Failed [%u]: PSL check(%s) -> %d (expected %d)\n", it, t->set_cookie, result_psl, t->psl_result); } mget_cookie_deinit(&cookie); goto next; } if (cookie.expires) { mget_http_print_date(cookie.expires, thedate, sizeof(thedate)); if (strcmp(thedate, t->expires)) { failed++; info_printf("Failed [%u]: expires mismatch: '%s' != '%s' (time_t %lld)\n", it, thedate, t->expires, (long long)cookie.expires); mget_cookie_deinit(&cookie); goto next; } } if (strcmp(cookie.name, t->name) || strcmp(cookie.value, t->value) || strcmp(cookie.domain, t->domain) || strcmp(cookie.path, t->path) || cookie.domain_dot != t->domain_dot || cookie.normalized != t->normalized || cookie.persistent != t->persistent || 
cookie.host_only != t->host_only || cookie.secure_only != t->secure_only || cookie.http_only != t->http_only) { failed++; info_printf("Failed [%u]: cookie (%s) differs:\n", it, t->set_cookie); if (strcmp(cookie.name, t->name)) info_printf(" name %s (expected %s)\n", cookie.name, t->name); if (strcmp(cookie.value, t->value)) info_printf(" value %s (expected %s)\n", cookie.value, t->value); if (strcmp(cookie.domain, t->domain)) info_printf(" domain %s (expected %s)\n", cookie.domain, t->domain); if (strcmp(cookie.path, t->path)) info_printf(" path %s (expected %s)\n", cookie.path, t->path); if (cookie.domain_dot != t->domain_dot) info_printf(" domain_dot %d (expected %d)\n", cookie.domain_dot, t->domain_dot); if (cookie.normalized != t->normalized) info_printf(" normalized %d (expected %d)\n", cookie.normalized, t->normalized); if (cookie.persistent != t->persistent) info_printf(" persistent %d (expected %d)\n", cookie.persistent, t->persistent); if (cookie.host_only != t->host_only) info_printf(" host_only %d (expected %d)\n", cookie.host_only, t->host_only); if (cookie.secure_only != t->secure_only) info_printf(" secure_only %d (expected %d)\n", cookie.secure_only, t->secure_only); if (cookie.http_only != t->http_only) info_printf(" http_only %d (expected %d)\n", cookie.http_only, t->http_only); mget_cookie_deinit(&cookie); goto next; } mget_cookie_store_cookie(cookies, &cookie); info_printf("%s\n", header = mget_cookie_create_request_header(cookies, iri)); xfree(header); ok++; next: mget_iri_free(&iri); } mget_cookie_db_free(&cookies); }
static void test_iri_relative_to_absolute(void) { static const struct iri_test_data { const char *base, *relative, *result; } test_data[] = { #define H1 "http://x.tld" { H1, "", H1"/" }, { H1, ".", H1"/" }, { H1, "./", H1"/" }, { H1, "..", H1"/" }, { H1, "../", H1"/" }, { H1, "foo", H1"/foo" }, { H1, "foo/bar", H1"/foo/bar" }, { H1, "foo///bar", H1"/foo/bar" }, { H1, "foo/.", H1"/foo/" }, { H1, "foo/./", H1"/foo/" }, { H1, "foo./", H1"/foo./" }, { H1, "foo/../bar", H1"/bar" }, { H1, "foo/../bar/", H1"/bar/" }, { H1, "foo/bar/..", H1"/foo/" }, { H1, "foo/bar/../x", H1"/foo/x" }, { H1, "foo/bar/../x/", H1"/foo/x/" }, { H1, "foo/..", H1"/" }, { H1, "foo/../..", H1"/" }, { H1, "foo/../../..", H1"/" }, { H1, "foo/../../bar/../../baz", H1"/baz" }, { H1, "a/b/../../c", H1"/c" }, { H1, "./a/../b", H1"/b" }, { H1, "/", H1"/" }, { H1, "/.", H1"/" }, { H1, "/./", H1"/" }, { H1, "/..", H1"/" }, { H1, "/../", H1"/" }, { H1, "/foo", H1"/foo" }, { H1, "/foo/bar", H1"/foo/bar" }, { H1, "/foo///bar", H1"/foo/bar" }, { H1, "/foo/.", H1"/foo/" }, { H1, "/foo/./", H1"/foo/" }, { H1, "/foo./", H1"/foo./" }, { H1, "/foo/../bar", H1"/bar" }, { H1, "/foo/../bar/", H1"/bar/" }, { H1, "/foo/bar/..", H1"/foo/" }, { H1, "/foo/bar/../x", H1"/foo/x" }, { H1, "/foo/bar/../x/", H1"/foo/x/" }, { H1, "/foo/..", H1"/" }, { H1, "/foo/../..", H1"/" }, { H1, "/foo/../../..", H1"/" }, { H1, "/foo/../../bar/../../baz", H1"/baz" }, { H1, "/a/b/../../c", H1"/c" }, { H1, "/./a/../b", H1"/b" }, { H1, ".x", H1"/.x" }, { H1, "..x", H1"/..x" }, { H1, "foo/.x", H1"/foo/.x" }, { H1, "foo/bar/.x", H1"/foo/bar/.x" }, { H1, "foo/..x", H1"/foo/..x" }, { H1, "foo/bar/..x", H1"/foo/bar/..x" }, { H1, "/x.php?y=ftp://example.com/&z=1_2", H1"/x.php?y=ftp://example.com/&z=1_2" }, { H1, "//x.y.com/", "http://x.y.com/" }, { H1, "http://x.y.com/", "http://x.y.com/" }, // { H1, "site;sub:.html", H1"/site;sub:.html" }, #undef H1 #define H1 "http://x.tld/" { H1, "", H1"" }, { H1, ".", H1"" }, { H1, "./", H1"" }, { H1, "..", H1"" 
}, { H1, "../", H1"" }, { H1, "foo", H1"foo" }, { H1, "foo/bar", H1"foo/bar" }, { H1, "foo///bar", H1"foo/bar" }, { H1, "foo/.", H1"foo/" }, { H1, "foo/./", H1"foo/" }, { H1, "foo./", H1"foo./" }, { H1, "foo/../bar", H1"bar" }, { H1, "foo/../bar/", H1"bar/" }, { H1, "foo/bar/..", H1"foo/" }, { H1, "foo/bar/../x", H1"foo/x" }, { H1, "foo/bar/../x/", H1"foo/x/" }, { H1, "foo/..", H1"" }, { H1, "foo/../..", H1"" }, { H1, "foo/../../..", H1"" }, { H1, "foo/../../bar/../../baz", H1"baz" }, { H1, "a/b/../../c", H1"c" }, { H1, "./a/../b", H1"b" }, { H1, "/", H1"" }, { H1, "/.", H1"" }, { H1, "/./", H1"" }, { H1, "/..", H1"" }, { H1, "/../", H1"" }, { H1, "/foo", H1"foo" }, { H1, "/foo/bar", H1"foo/bar" }, { H1, "/foo///bar", H1"foo/bar" }, { H1, "/foo/.", H1"foo/" }, { H1, "/foo/./", H1"foo/" }, { H1, "/foo./", H1"foo./" }, { H1, "/foo/../bar", H1"bar" }, { H1, "/foo/../bar/", H1"bar/" }, { H1, "/foo/bar/..", H1"foo/" }, { H1, "/foo/bar/../x", H1"foo/x" }, { H1, "/foo/bar/../x/", H1"foo/x/" }, { H1, "/foo/..", H1"" }, { H1, "/foo/../..", H1"" }, { H1, "/foo/../../..", H1"" }, { H1, "/foo/../../bar/../../baz", H1"baz" }, { H1, "/a/b/../../c", H1"c" }, { H1, "/./a/../b", H1"b" }, { H1, ".x", H1".x" }, { H1, "..x", H1"..x" }, { H1, "foo/.x", H1"foo/.x" }, { H1, "foo/bar/.x", H1"foo/bar/.x" }, { H1, "foo/..x", H1"foo/..x" }, { H1, "foo/bar/..x", H1"foo/bar/..x" }, { H1, "/x.php?y=ftp://example.com/&z=1_2", H1"x.php?y=ftp://example.com/&z=1_2" }, { H1, "//x.y.com/", "http://x.y.com/" }, { H1, "http://x.y.com/", "http://x.y.com/" }, #undef H1 #define H1 "http://x.tld/file" #define R1 "http://x.tld/" { H1, "", R1"" }, { H1, ".", R1"" }, { H1, "./", R1"" }, { H1, "..", R1"" }, { H1, "../", R1"" }, { H1, "foo", R1"foo" }, { H1, "foo/bar", R1"foo/bar" }, { H1, "foo///bar", R1"foo/bar" }, { H1, "foo/.", R1"foo/" }, { H1, "foo/./", R1"foo/" }, { H1, "foo./", R1"foo./" }, { H1, "foo/../bar", R1"bar" }, { H1, "foo/../bar/", R1"bar/" }, { H1, "foo/bar/..", R1"foo/" }, { H1, 
"foo/bar/../x", R1"foo/x" }, { H1, "foo/bar/../x/", R1"foo/x/" }, { H1, "foo/..", R1"" }, { H1, "foo/../..", R1"" }, { H1, "foo/../../..", R1"" }, { H1, "foo/../../bar/../../baz", R1"baz" }, { H1, "a/b/../../c", R1"c" }, { H1, "./a/../b", R1"b" }, { H1, "/", R1"" }, { H1, "/.", R1"" }, { H1, "/./", R1"" }, { H1, "/..", R1"" }, { H1, "/../", R1"" }, { H1, "/foo", R1"foo" }, { H1, "/foo/bar", R1"foo/bar" }, { H1, "/foo///bar", R1"foo/bar" }, { H1, "/foo/.", R1"foo/" }, { H1, "/foo/./", R1"foo/" }, { H1, "/foo./", R1"foo./" }, { H1, "/foo/../bar", R1"bar" }, { H1, "/foo/../bar/", R1"bar/" }, { H1, "/foo/bar/..", R1"foo/" }, { H1, "/foo/bar/../x", R1"foo/x" }, { H1, "/foo/bar/../x/", R1"foo/x/" }, { H1, "/foo/..", R1"" }, { H1, "/foo/../..", R1"" }, { H1, "/foo/../../..", R1"" }, { H1, "/foo/../../bar/../../baz", R1"baz" }, { H1, "/a/b/../../c", R1"c" }, { H1, "/./a/../b", R1"b" }, { H1, ".x", R1".x" }, { H1, "..x", R1"..x" }, { H1, "foo/.x", R1"foo/.x" }, { H1, "foo/bar/.x", R1"foo/bar/.x" }, { H1, "foo/..x", R1"foo/..x" }, { H1, "foo/bar/..x", R1"foo/bar/..x" }, { H1, "/x.php?y=ftp://example.com/&z=1_2", R1"x.php?y=ftp://example.com/&z=1_2" }, { H1, "//x.y.com/", "http://x.y.com/" }, { H1, "http://x.y.com/", "http://x.y.com/" }, #undef H1 #undef R1 #define H1 "http://x.tld/dir/" #define R1 "http://x.tld/" { H1, "", H1"" }, { H1, ".", H1"" }, { H1, "./", H1"" }, { H1, "..", R1"" }, { H1, "../", R1"" }, { H1, "foo", H1"foo" }, { H1, "foo/bar", H1"foo/bar" }, { H1, "foo///bar", H1"foo/bar" }, { H1, "foo/.", H1"foo/" }, { H1, "foo/./", H1"foo/" }, { H1, "foo./", H1"foo./" }, { H1, "foo/../bar", H1"bar" }, { H1, "foo/../bar/", H1"bar/" }, { H1, "foo/bar/..", H1"foo/" }, { H1, "foo/bar/../x", H1"foo/x" }, { H1, "foo/bar/../x/", H1"foo/x/" }, { H1, "foo/..", H1"" }, { H1, "foo/../..", R1"" }, { H1, "foo/../../..", R1"" }, { H1, "foo/../../bar/../../baz", R1"baz" }, { H1, "a/b/../../c", H1"c" }, { H1, "./a/../b", H1"b" }, { H1, "/", R1"" }, { H1, "/.", R1"" }, { H1, "/./", 
R1"" }, { H1, "/..", R1"" }, { H1, "/../", R1"" }, { H1, "/foo", R1"foo" }, { H1, "/foo/bar", R1"foo/bar" }, { H1, "/foo///bar", R1"foo/bar" }, { H1, "/foo/.", R1"foo/" }, { H1, "/foo/./", R1"foo/" }, { H1, "/foo./", R1"foo./" }, { H1, "/foo/../bar", R1"bar" }, { H1, "/foo/../bar/", R1"bar/" }, { H1, "/foo/bar/..", R1"foo/" }, { H1, "/foo/bar/../x", R1"foo/x" }, { H1, "/foo/bar/../x/", R1"foo/x/" }, { H1, "/foo/..", R1"" }, { H1, "/foo/../..", R1"" }, { H1, "/foo/../../..", R1"" }, { H1, "/foo/../../bar/../../baz", R1"baz" }, { H1, "/a/b/../../c", R1"c" }, { H1, "/./a/../b", R1"b" }, { H1, ".x", H1".x" }, { H1, "..x", H1"..x" }, { H1, "foo/.x", H1"foo/.x" }, { H1, "foo/bar/.x", H1"foo/bar/.x" }, { H1, "foo/..x", H1"foo/..x" }, { H1, "foo/bar/..x", H1"foo/bar/..x" }, { H1, "/x.php?y=ftp://example.com/&z=1_2", R1"x.php?y=ftp://example.com/&z=1_2" }, { H1, "//x.y.com/", "http://x.y.com/" }, { H1, "http://x.y.com/", "http://x.y.com/" } #undef H1 #undef R1 }; unsigned it; char uri_buf_static[32]; // use a size that forces allocation in some cases mget_buffer_t *uri_buf = mget_buffer_init(NULL, uri_buf_static, sizeof(uri_buf_static)); mget_iri_t *base; for (it = 0; it < countof(test_data); it++) { const struct iri_test_data *t = &test_data[it]; base = mget_iri_parse(t->base, "utf-8"); mget_iri_relative_to_abs(base, t->relative, strlen(t->relative), uri_buf); if (!strcmp(uri_buf->data, t->result)) ok++; else { failed++; info_printf("Failed [%u]: %s+%s -> %s (expected %s)\n", it, t->base, t->relative, uri_buf->data, t->result); } mget_iri_free(&base); } mget_buffer_free(&uri_buf); }
static void test_iri_parse(void) { const struct iri_test_data { const char *uri, *display, *scheme, *userinfo, *password, *host, *port, *path, *query, *fragment; } test_data[] = { { "1.2.3.4", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "1.2.3.4", NULL, NULL, NULL, NULL}, { "1.2.3.4:987", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "1.2.3.4", "987", NULL, NULL, NULL}, { "//example.com/thepath", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "thepath", NULL, NULL}, // { "///thepath", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, NULL, NULL, "thepath", NULL, NULL}, { "example.com", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "example.com:555", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", "555", NULL, NULL, NULL}, { "http://example.com", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "http://example.com:", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "http://example.com:/", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "", NULL, NULL}, { "http://example.com:80/", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "", NULL, NULL}, { "https://example.com", NULL, MGET_IRI_SCHEME_HTTPS, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "https://example.com:443", NULL, MGET_IRI_SCHEME_HTTPS, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "https://example.com:444", NULL, MGET_IRI_SCHEME_HTTPS, NULL, NULL, "example.com", "444", NULL, NULL, NULL}, { "http://example.com:80", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, NULL, NULL, NULL}, { "http://example.com:81", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", "81", NULL, NULL, NULL}, { "http://example.com/index.html", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "index.html", NULL, NULL}, { "http://example.com/index.html?query#frag", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "index.html", "query", "frag"}, { 
"http://example.com/index.html?#", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "example.com", NULL, "index.html", "", ""}, { "碼標準萬國碼.com", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "xn--9cs565brid46mda086o.com", NULL, NULL, NULL, NULL}, // { "ftp://cnn.example.com&[email protected]/top_story.htm", NULL,"ftp",NULL,NULL,"cnn.example.com",NULL,NULL,"[email protected]/top_story.htm",NULL } { "ftp://[email protected]/top_story.htm", NULL, "ftp", NULL, NULL, "cnn.example.com", NULL, NULL, "[email protected]/top_story.htm", NULL}, // { "site;sub:.html", NULL, MGET_IRI_SCHEME_HTTP, NULL, NULL, "site", NULL, ";sub:.html", NULL, NULL}, }; unsigned it; for (it = 0; it < countof(test_data); it++) { const struct iri_test_data *t = &test_data[it]; mget_iri_t *iri = mget_iri_parse(t->uri, "utf-8"); if (mget_strcmp(iri->display, t->display) || mget_strcmp(iri->scheme, t->scheme) || mget_strcmp(iri->userinfo, t->userinfo) || mget_strcmp(iri->password, t->password) || mget_strcmp(iri->host, t->host) || mget_strcmp(iri->port, t->port) || mget_strcmp(iri->path, t->path) || mget_strcmp(iri->query, t->query) || mget_strcmp(iri->fragment, t->fragment)) { failed++; printf("IRI test #%u failed:\n", it + 1); printf(" [%s]\n", iri->uri); printf(" display %s (expected %s)\n", iri->display, t->display); printf(" scheme %s (expected %s)\n", iri->scheme, t->scheme); printf(" user %s (expected %s)\n", iri->userinfo, t->userinfo); printf(" host %s (expected %s)\n", iri->host, t->host); printf(" port %s (expected %s)\n", iri->port, t->port); printf(" path %s (expected %s)\n", iri->path, t->path); printf(" query %s (expected %s)\n", iri->query, t->query); printf(" fragment %s (expected %s)\n", iri->fragment, t->fragment); printf("\n"); } else { ok++; } mget_iri_free(&iri); } }
static void test_stringmap(void) { mget_stringmap_t *m; char key[128], value[128], *val; int run, it; size_t valuesize; // the initial size of 16 forces the internal reshashing function to be called twice m = mget_stringmap_create(16); for (run = 0; run < 2; run++) { if (run) { mget_stringmap_clear(m); mget_stringmap_sethashfunc(m, hash_txt); } for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; // now, look up every single entry for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); sprintf(value, "%d.html", it); if (!(val = mget_stringmap_get(m, key))) { failed++; info_printf("stringmap_get(%s) didn't find entry\n", key); } else if (strcmp(val, value)) { failed++; info_printf("stringmap_get(%s) found '%s' (expected '%s')\n", key, val, value); } else ok++; } mget_stringmap_clear(m); if ((it = mget_stringmap_size(m)) != 0) { failed++; info_printf("stringmap_size() returned %d (expected 0)\n", it); } else ok++; for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; // now, remove every single entry for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); sprintf(value, "%d.html", it); mget_stringmap_remove(m, key); } if ((it = mget_stringmap_size(m)) != 0) { failed++; 
info_printf("stringmap_size() returned %d (expected 0)\n", it); } else ok++; for (it = 0; it < 26; it++) { sprintf(key, "http://www.example.com/subdir/%d.html", it); valuesize = sprintf(value, "%d.html", it); if (mget_stringmap_put(m, key, value, valuesize + 1)) { failed++; info_printf("stringmap_put(%s) returns unexpected old value\n", key); } else ok++; } if ((it = mget_stringmap_size(m)) != 26) { failed++; info_printf("stringmap_size() returned %d (expected %d)\n", it, 26); } else ok++; } // testing alloc/free in stringmap/hashmap mget_stringmap_clear(m); mget_stringmap_put(m, "thekey", NULL, 0) ? failed++ : ok++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; // testing key/value identity alloc/free in stringmap/hashmap mget_stringmap_clear(m); mget_stringmap_put(m, "thekey", NULL, 0) ? failed++ : ok++; mget_stringmap_put(m, "thekey", NULL, 0) ? ok++ : failed++; mget_stringmap_put(m, "thekey", "thevalue", 9) ? ok++ : failed++; mget_stringmap_put(m, "thekey", NULL, 0) ? 
ok++ : failed++; mget_stringmap_free(&m); mget_http_challenge_t challenge; mget_http_parse_challenge("Basic realm=\"test realm\"", &challenge); mget_http_free_challenge(&challenge); mget_vector_t *challenges; challenges = mget_vector_create(2, 2, NULL); mget_vector_set_destructor(challenges, (void(*)(void *))mget_http_free_challenge); mget_http_parse_challenge("Basic realm=\"test realm\"", &challenge); mget_vector_add(challenges, &challenge, sizeof(challenge)); mget_http_free_challenges(&challenges); char *response_text = strdup( "HTTP/1.1 401 Authorization Required\r\n"\ "Date: Sun, 23 Dec 2012 21:03:45 GMT\r\n"\ "Server: Apache/2.2.22 (Debian)\r\n"\ "WWW-Authenticate: Digest realm=\"therealm\", nonce=\"Ip6MaovRBAA=c4af733c51270698260f5d357724c2cbce20fa3d\", algorithm=MD5, domain=\"/prot_digest_md5\", qop=\"auth\"\r\n"\ "Vary: Accept-Encoding\r\n"\ "Content-Length: 476\r\n"\ "Keep-Alive: timeout=5, max=99\r\n"\ "Connection: Keep-Alive\r\n"\ "Content-Type: text/html; charset=iso-8859-1\r\n\r\n"); mget_iri_t *iri = mget_iri_parse("http://localhost/prot_digest_md5/", NULL); mget_http_request_t *req = mget_http_create_request(iri, "GET"); mget_http_response_t *resp = mget_http_parse_response_header(response_text); mget_http_add_credentials(req, mget_vector_get(resp->challenges, 0), "tim", "123"); // for (it=0;it<vec_size(req->lines);it++) { // info_printf("%s\n", (char *)vec_get(req->lines, it)); // } mget_http_free_response(&resp); mget_http_free_request(&req); mget_iri_free(&iri); xfree(response_text); // Authorization: Digest username="******", realm="therealm", nonce="Ip6MaovRBAA=c4af733c51270698260f5d357724c2cbce20fa3d", uri="/prot_digest_md5/", response="a99e2012d507a73dd46eb044d3f4641c", qop=auth, nc=00000001, cnonce="3d20faa1" }
mget_iri_t *mget_iri_parse(const char *url, const char *encoding) { mget_iri_t *iri; const char *default_port = NULL; char *p, *s, *authority, c; size_t slen, it; int url_allocated, maybe_scheme; if (!url) return NULL; /* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ while (isspace(*url)) url++; if (!*url) return NULL; // first unescape, than convert to UTF-8 if (strchr(url, '%')) { char *unesc_url = strdup(url); mget_percent_unescape(unesc_url); if (mget_str_needs_encoding(unesc_url)) { if ((url = mget_str_to_utf8(unesc_url, encoding))) xfree(unesc_url); else url = unesc_url; // on error, use what we have } else url = unesc_url; url_allocated = 1; } else { url_allocated = 0; if (mget_str_needs_encoding(url)) { if ((s = mget_str_to_utf8(url, encoding))) { url = s; url_allocated = 1; } } } // just use one block of memory for all parsed URI parts slen = strlen(url); iri = xmalloc(sizeof(mget_iri_t) + slen * 2 + 2); memset(iri, 0, sizeof(mget_iri_t)); strcpy(((char *)iri) + sizeof(mget_iri_t), url); iri->uri = ((char *)iri) + sizeof(mget_iri_t); s = ((char *)iri) + sizeof(mget_iri_t) + slen + 1; strcpy(s, url); if (url_allocated) xfree(url); p = s; if (isalpha(*p)) { maybe_scheme = 1; while (*s && !_iri_isgendelim(*s)) { if (maybe_scheme && !_iri_isscheme(*s)) maybe_scheme = 0; s++; } } else maybe_scheme = 0; if (maybe_scheme && (*s == ':' && (s[1] == '/' || s[1] == 0))) { // found a scheme *s++ = 0; // find the scheme in our static list of supported schemes // for later comparisons we compare pointers (avoiding strcasecmp()) iri->scheme = p; for (it = 0; mget_iri_schemes[it]; it++) { if (!mget_strcasecmp_ascii(mget_iri_schemes[it], p)) { iri->scheme = mget_iri_schemes[it]; default_port = iri_ports[it]; break; } } if (iri->scheme == p) { // convert scheme to lowercase mget_strtolower((char *)iri->scheme); } } else { 
iri->scheme = MGET_IRI_SCHEME_DEFAULT; default_port = iri_ports[0]; // port 80 s = p; // rewind } // this is true for http, https, ftp, file if (s[0] == '/' && s[1] == '/') s += 2; // authority authority = s; while (*s && *s != '/' && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; // left over: [path][?query][#fragment] if (c == '/') { iri->path = s; while (*s && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '?') { iri->query = s; while (*s && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '#') { iri->fragment = s; s += strlen(s); } if (*s) { debug_printf("unparsed rest '%s'\n", s); } if (*authority) { s = authority; p = strchr(authority, '@'); if (p) { iri->userinfo = s; *p = 0; s = p + 1; } if (*s == '[') { p = strrchr(s, ']'); if (p) { iri->host = s + 1; *p = 0; s = p + 1; } else { // something is broken iri->host = s + 1; s += strlen(s); } } else { iri->host = s; while (*s && *s != ':') s++; } if (*s == ':') { if (s[1]) { if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port))) iri->port = s + 1; } } *s = 0; } iri->resolv_port = iri->port ? iri->port : default_port; // now unescape all components (not interested in display, userinfo, password) if (iri->host) { mget_strtolower((char *)iri->host); if ((p = (char *)mget_str_to_ascii(iri->host)) != iri->host) { iri->host = p; iri->host_allocated = 1; } } else { if (iri->scheme == MGET_IRI_SCHEME_HTTP || iri->scheme == MGET_IRI_SCHEME_HTTPS) { error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri); mget_iri_free(&iri); return NULL; } } /* debug_printf("scheme=%s\n",iri->scheme); debug_printf("host=%s\n",iri->host); debug_printf("path=%s\n",iri->path); debug_printf("query=%s\n",iri->query); debug_printf("fragment=%s\n",iri->fragment); */ return iri; }
int main(int argc, const char *const *argv) { // Base URI for converting relative to absolute URIs const char * base = "http://www.example.com"; // We assume that base is encoded in the local charset. const char * local_encoding = mget_local_charset_encoding(); // parsed 'base' mget_iri_t *base_uri; // Character encoding of CSS file content // An HTTP response may contain the encoding in the Content-Type header, // but if // see http://stackoverflow.com/questions/2526033/why-specify-charset-utf-8-in-your-css-file const char * css_encoding = NULL; int argpos; // We want the libmget error messages be printed to STDERR. // From here on, we can call mget_error_printf, etc. mget_logger_set_stream(mget_get_logger(MGET_LOGGER_ERROR), stderr); // We want the libmget info messages be printed to STDOUT. // From here on, we can call mget_info_printf, etc. mget_logger_set_stream(mget_get_logger(MGET_LOGGER_INFO), stdout); // parse options for (argpos = 1; argpos < argc; argpos++) { if (!strcmp(argv[argpos], "--base") && argc - argpos > 1) { base = argv[++argpos]; info_printf("Base URL encoding = '%s'\n", local_encoding); } else if (!strcmp(argv[argpos], "--encoding") && argc - argpos > 1) { css_encoding = argv[++argpos]; } else if (!strcmp(argv[argpos], "--")) { argpos++; break; } else if (argv[argpos][0] == '-') { usage(argv[0]); } else break; } // All URIs are converted into UTF-8 charset. // That's why we need the local encoding (aka 'encoding of base URI') here. base_uri = mget_iri_parse(base, local_encoding); for (; argpos < argc; argpos++) { // use '-' as filename for STDIN css_parse_localfile(argv[argpos], base_uri, css_encoding); } mget_iri_free(&base_uri); return 0; }
MGET_IRI *mget_iri_parse(const char *s_uri, const char *encoding) { MGET_IRI *iri; const char *default_port = NULL; char *p, *s, *authority, c; size_t slen, it; if (!s_uri) return NULL; /* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ while (isspace(*s_uri)) s_uri++; if (!*s_uri) return NULL; // just use one block of memory for all parsed URI parts slen = strlen(s_uri); iri = xmalloc(sizeof(MGET_IRI) + slen * 2 + 2); memset(iri, 0, sizeof(MGET_IRI)); strcpy(((char *)iri) + sizeof(MGET_IRI), s_uri); iri->uri = ((char *)iri) + sizeof(MGET_IRI); s = ((char *)iri) + sizeof(MGET_IRI) + slen + 1; strcpy(s, s_uri); p = s; while (*s && !_iri_isgendelim(*s)) s++; if (*s == ':' && s[1]=='/') { // found a scheme *s++ = 0; // find the scheme in our static list of supported schemes // for later comparisons we compare pointers (avoiding strcasecmnp()) iri->scheme = p; for (it = 0; iri_schemes[it]; it++) { if (!strcasecmp(iri_schemes[it], p)) { iri->scheme = iri_schemes[it]; default_port = iri_ports[it]; break; } } if (iri->scheme == p) { // convert scheme to lowercase for (; *p; p++) if (isupper(*p)) *p = tolower(*p); } } else { iri->scheme = IRI_SCHEME_DEFAULT; default_port = iri_ports[0]; // port 80 s = p; // rewind } // this is true for http, https, ftp, file if (s[0] == '/' && s[1] == '/') s += 2; // authority authority = s; while (*s && *s != '/' && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; // left over: [path][?query][#fragment] if (c == '/') { iri->path = s; while (*s && *s != '?' 
&& *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '?') { iri->query = s; while (*s && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '#') { iri->fragment = s; while (*s) s++; } if (*s) { debug_printf("unparsed rest '%s'\n", s); } if (*authority) { s = authority; p = strchr(authority, '@'); if (p) { iri->userinfo = s; *p = 0; s = p + 1; } if (*s == '[') { p = strrchr(s, ']'); if (p) { iri->host = s + 1; *p = 0; s = p + 1; } else { // something is broken iri->host = s + 1; while (*s) s++; } } else { iri->host = s; while (*s && *s != ':') s++; } if (*s == ':') { if (s[1]) { if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port))) iri->port = s + 1; } } *s = 0; /* for (p = (char *)iri->host; *p; p++) if (*p >= 'A' && *p <= 'Z') // isupper() also returns true for chars > 0x7f, the test is not EBCDIC compatible ;-) *p = tolower(*p); */ } iri->resolv_port = iri->port ? iri->port : default_port; // now unescape all components (not interested in display, userinfo, password if (iri->host) { const char *host_utf; char *p; _unescape((unsigned char *)iri->host); host_utf = mget_str_to_utf8(iri->host, encoding); if (host_utf) { char *host_asc = NULL; int rc; if ((rc = idna_to_ascii_8z(host_utf, &host_asc, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) { // log_printf("toASCII '%s' -> '%s'\n", host_utf, host_asc); iri->host = host_asc; iri->host_allocated = 1; } else error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); xfree(host_utf); } for (p = (char *)iri->host; *p; p++) if (*p >= 'A' && *p <= 'Z') // isupper() also returns true for chars > 0x7f, the test is not EBCDIC compatible ;-) *p = tolower(*p); } else { if (iri->scheme == IRI_SCHEME_HTTP || iri->scheme == IRI_SCHEME_HTTPS) { error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri); mget_iri_free(&iri); return NULL; } } if (iri->path) _unescape((unsigned char *)iri->path); if (iri->query) _unescape((unsigned char *)iri->query); if (iri->fragment) 
_unescape((unsigned char *)iri->fragment); // info_printf("%s: path '%s'\n", iri->uri, iri->path); return iri; }