/**
 * XML parser callback that collects URL candidates from an RSS document.
 *
 * A value is collected when it is
 *  - the value of an attribute named url, href, src, domain or xmlns, or of
 *    an attribute whose name starts with "xmlns:", or
 *  - the content of an element whose innermost name in \p dir is guid, link,
 *    comments or docs.
 * All name comparisons are ASCII case-insensitive.
 *
 * Collected values are stripped of leading and trailing whitespace and added
 * to ctx->urls (created on first use) as a wget_string_t whose pointer refers
 * into \p val; the text itself is not duplicated here.
 *
 * \param[in] context The struct rss_context receiving the URLs
 * \param[in] flags XML_FLG_* bits describing what \p val is
 * \param[in] dir Path of the current element
 * \param[in] attr Attribute name (used for XML_FLG_ATTRIBUTE only)
 * \param[in] val Attribute value or element content, not necessarily NUL-terminated
 * \param[in] len Length of \p val in bytes
 * \param[in] pos Unused
 */
static void _rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	struct rss_context *ctx = context;
	wget_string_t url;
	int wanted = 0;

	if (!val || !len)
		return;

	url.p = NULL;

	if (flags & XML_FLG_ATTRIBUTE) {
		// an attribute is never treated as content, even if both flags are set
		wanted = !wget_strcasecmp_ascii(attr, "url")
			|| !wget_strcasecmp_ascii(attr, "href")
			|| !wget_strcasecmp_ascii(attr, "src")
			|| !wget_strcasecmp_ascii(attr, "domain")
			|| !wget_strcasecmp_ascii(attr, "xmlns")
			|| !wget_strncasecmp_ascii(attr, "xmlns:", 6);
	} else if (flags & XML_FLG_CONTENT) {
		const char *slash = strrchr(dir, '/');

		if (slash) {
			const char *elem = slash + 1; // innermost element name

			wanted = !wget_strcasecmp_ascii(elem, "guid")
				|| !wget_strcasecmp_ascii(elem, "link")
				|| !wget_strcasecmp_ascii(elem, "comments")
				|| !wget_strcasecmp_ascii(elem, "docs");
		}
	}

	if (!wanted)
		return;

	// strip leading whitespace
	while (len && c_isspace(*val)) {
		val++;
		len--;
	}

	// strip trailing whitespace
	while (len && c_isspace(val[len - 1]))
		len--;

	url.p = val;
	url.len = len;

	if (!ctx->urls)
		ctx->urls = wget_vector_create(32, -2, NULL);

	wget_vector_add(ctx->urls, &url, sizeof(url));
}
/**
 * Compare the digest of a region of an open file with an expected value.
 *
 * \p offset and \p length are handed unchanged to wget_hash_file_fd() to
 * select the region; the digest is computed with the algorithm hash->type.
 *
 * \param[in] hash Expected digest (hash->hash_hex) and its algorithm (hash->type)
 * \param[in] fd Open file descriptor to read from
 * \param[in] offset Region start, as understood by wget_hash_file_fd()
 * \param[in] length Region length, as understood by wget_hash_file_fd()
 * \return 1 if the digests are equal (ASCII case-insensitive), 0 if they
 *   differ, -1 if wget_hash_file_fd() failed
 */
static int check_piece_hash(wget_metalink_hash_t *hash, int fd, off_t offset, size_t length)
{
	char digest_hex[128 + 1]; // large enough for sha-512 hex

	if (wget_hash_file_fd(hash->type, fd, digest_hex, sizeof(digest_hex), offset, length) == -1)
		return -1;

	return wget_strcasecmp_ascii(digest_hex, hash->hash_hex) == 0;
}
/**
 * Compare the digest of an open file with an expected value.
 *
 * Calls wget_hash_file_fd() with offset 0 and length 0 (presumably meaning
 * "the whole file" - confirm against wget_hash_file_fd()) using the algorithm
 * hash->type.
 *
 * \param[in] hash Expected digest (hash->hash_hex) and its algorithm (hash->type)
 * \param[in] fd Open file descriptor to read from
 * \return 1 if the digests are equal (ASCII case-insensitive), 0 if they
 *   differ, -1 if wget_hash_file_fd() failed
 */
static int _check_file_fd(wget_metalink_hash_t *hash, int fd)
{
	char digest_hex[128 + 1]; // large enough for sha-512 hex

	if (wget_hash_file_fd(hash->type, fd, digest_hex, sizeof(digest_hex), 0, 0) == -1)
		return -1;

	return wget_strcasecmp_ascii(digest_hex, hash->hash_hex) == 0;
}
/**
 * Convert a NUL-terminated string from one character encoding to another.
 *
 * When built with HAVE_ICONV, a NULL encoding name is treated as
 * "iso-8859-1". If both names are equal (ASCII case-insensitive), or if
 * HAVE_ICONV is not defined, the input is simply duplicated with strdup().
 *
 * \param[in] src String to convert, may be NULL
 * \param[in] src_encoding Encoding of \p src, or NULL
 * \param[in] dst_encoding Encoding of the result, or NULL
 * \return The converted (or duplicated) string, or NULL if \p src is NULL
 *   or the conversion could not be set up or carried out
 */
char *wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding)
{
	if (!src)
		return NULL;

#ifdef HAVE_ICONV
	// default character-set for most browsers
	if (!src_encoding)
		src_encoding = "iso-8859-1";
	if (!dst_encoding)
		dst_encoding = "iso-8859-1";

	if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
		iconv_t cd = iconv_open(dst_encoding, src_encoding);

		if (cd == (iconv_t)-1) {
			error_printf(_("Failed to prepare encoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
			return NULL;
		}

		char *converted = NULL;
		char *in = (char *) src; // iconv advances 'in', it does not write through it
		size_t in_left = strlen(src);
		size_t out_size = in_left * 6; // output buffer is sized at six bytes per input byte
		size_t out_left = out_size;
		char *out_buf = xmalloc(out_size + 1);
		char *out = out_buf;

		// the second iconv() call has no input and only writes pending output
		if (iconv(cd, &in, &in_left, &out, &out_left) != (size_t)-1
			&& iconv(cd, NULL, NULL, &out, &out_left) != (size_t)-1)
		{
			converted = wget_strmemdup(out_buf, out_size - out_left);
			debug_printf("converted '%s' (%s) -> '%s' (%s)\n", src, src_encoding, converted, dst_encoding);
		} else {
			error_printf(_("Failed to convert '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
		}

		xfree(out_buf);
		iconv_close(cd);

		return converted;
	}
#endif

	return strdup(src);
}
static void test_parser(void) { DIR *dirp; struct dirent *dp; const char *ext; char fname[128]; int xml = 0, html = 0, css = 0; // test the XML / HTML parser, you should start the test with valgrind // to detect memory faults if ((dirp = opendir(SRCDIR "/files")) != NULL) { while ((dp = readdir(dirp)) != NULL) { if (*dp->d_name == '.') continue; if ((ext = strrchr(dp->d_name, '.'))) { snprintf(fname, sizeof(fname), SRCDIR "/files/%s", dp->d_name); if (!wget_strcasecmp_ascii(ext, ".xml")) { info_printf("parsing %s\n", fname); wget_xml_parse_file(fname, NULL, NULL, 0); xml++; } /* else if (!wget_strcasecmp_ascii(ext, ".html")) { info_printf("parsing %s\n", fname); wget_html_parse_file(fname, NULL, NULL, 0); html++; } else if (!wget_strcasecmp_ascii(ext, ".css")) { info_printf("parsing %s\n", fname); wget_css_parse_file(fname, _css_dump_uri, _css_dump_charset, NULL); css++; } */ } } closedir(dirp); } info_printf("%d XML, %d HTML and %d CSS files parsed\n", xml, html, css); }
/** * \param[in] hashname Name of the hashing algorithm (see table below) * \return A constant to be used by libwget hashing functions * * Get the hashing algorithms list item that corresponds to the named hashing algorithm. * * This function returns a constant that uniquely identifies a known supported hashing algorithm * within libwget. All the supported algorithms are listed in the ::wget_digest_algorithm_t enum. * * Algorithm name | Constant * -------------- | -------- * sha1 or sha-1|WGET_DIGTYPE_SHA1 * sha256 or sha-256|WGET_DIGTYPE_SHA256 * sha512 or sha-512|WGET_DIGTYPE_SHA512 * sha224 or sha-224|WGET_DIGTYPE_SHA224 * sha384 or sha-384|WGET_DIGTYPE_SHA384 * md5|WGET_DIGTYPE_MD5 * md2|WGET_DIGTYPE_MD2 * rmd160|WGET_DIGTYPE_RMD160 */ wget_digest_algorithm_t wget_hash_get_algorithm(const char *hashname) { if (hashname) { if (*hashname == 's' || *hashname == 'S') { if (!wget_strcasecmp_ascii(hashname, "sha-1") || !wget_strcasecmp_ascii(hashname, "sha1")) return WGET_DIGTYPE_SHA1; else if (!wget_strcasecmp_ascii(hashname, "sha-256") || !wget_strcasecmp_ascii(hashname, "sha256")) return WGET_DIGTYPE_SHA256; else if (!wget_strcasecmp_ascii(hashname, "sha-512") || !wget_strcasecmp_ascii(hashname, "sha512")) return WGET_DIGTYPE_SHA512; else if (!wget_strcasecmp_ascii(hashname, "sha-224") || !wget_strcasecmp_ascii(hashname, "sha224")) return WGET_DIGTYPE_SHA224; else if (!wget_strcasecmp_ascii(hashname, "sha-384") || !wget_strcasecmp_ascii(hashname, "sha384")) return WGET_DIGTYPE_SHA384; } else if (!wget_strcasecmp_ascii(hashname, "md5")) return WGET_DIGTYPE_MD5; else if (!wget_strcasecmp_ascii(hashname, "md2")) return WGET_DIGTYPE_MD2; else if (!wget_strcasecmp_ascii(hashname, "rmd160")) return WGET_DIGTYPE_RMD160; } error_printf(_("Unknown hash type '%s'\n"), hashname); return WGET_DIGTYPE_UNKNOWN; }
static void html_parse_localfile(const char *fname) { char *data; const char *encoding = NULL; size_t len; if ((data = wget_read_file(fname, &len))) { if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) { // Big-endian UTF-16 encoding = "UTF-16BE"; // adjust behind BOM, ignore trailing single byte data += 2; len -= 2; } else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) { // Little-endian UTF-16 encoding = "UTF-16LE"; // adjust behind BOM data += 2; len -= 2; } else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) { // UTF-8 encoding = "UTF-8"; // adjust behind BOM data += 3; len -= 3; } if (encoding) printf("URI encoding '%s' set by BOM\n", encoding); if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) { size_t n; char *utf8; len -= len & 1; // ignore single trailing byte, else charset conversion fails if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) { printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding); data = utf8; } else { printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding); return; } } WGET_HTML_PARSED_RESULT *res = wget_html_get_urls_inline(data, NULL, NULL); if (encoding) { if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding)) printf("Encoding '%s' as stated in document has been ignored\n", encoding); } for (int it = 0; it < wget_vector_size(res->uris); it++) { WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it); wget_string_t *url = &html_url->url; printf(" %s.%s '%.*s'\n", html_url->dir, html_url->attr, (int) url->len, url->p); } wget_xfree(data); wget_html_free_urls_inline(&res); } }
/**
 * XML parser callback that fills a _metalink_context_t from a Metalink
 * document.
 *
 * Two layouts are recognized below "/metalink/file":
 *  - metalink 3: "/metalink/files/file/..." with the sub-paths
 *    /verification/pieces, /verification/pieces/hash, /verification/hash,
 *    /resources/url and /size
 *  - metalink 4: "/metalink/file/..." with the sub-paths
 *    /pieces, /pieces/hash, /hash, /url and /size
 * Path and attribute names are compared ASCII case-insensitively.
 *
 * With \p attr set, the value is an attribute value (file name, hash type,
 * piece length, mirror location, mirror priority). Without \p attr it is
 * element content (piece hash, file hash, file size, mirror URL).
 * Other attributes, such as a mirror's protocol or type, are ignored.
 *
 * \param[in] context The _metalink_context_t to fill
 * \param[in] flags XML_FLG_* bits; anything but content or attribute is ignored
 * \param[in] dir Path of the current element
 * \param[in] attr Attribute name, or NULL for element content
 * \param[in] val Value, not necessarily NUL-terminated
 * \param[in] len Length of \p val in bytes
 * \param[in] pos Unused
 */
static void _metalink_parse(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	_metalink_context_t *ctx = context;
	char value[len + 1]; // NUL-terminated copy of val for sscanf()/atoll() and the _add_*() helpers
	const char *pieces_dir, *piece_hash_dir, *hash_dir, *url_dir;
	int is_v3;

	if (!(flags & (XML_FLG_CONTENT | XML_FLG_ATTRIBUTE)))
		return; // ignore comments

	if (wget_strncasecmp_ascii(dir, "/metalink/file", 14))
		return;

	dir += 14;

	memcpy(value, val, len);
	value[len] = 0;

	// Both formats carry the same information, only the element paths differ.
	is_v3 = !wget_strncasecmp_ascii(dir, "s/file", 6);

	if (is_v3) {
		// metalink 3 XML format
		dir += 6;
		pieces_dir = "/verification/pieces";
		piece_hash_dir = "/verification/pieces/hash";
		hash_dir = "/verification/hash";
		url_dir = "/resources/url";
	} else {
		// metalink 4 XML format
		pieces_dir = "/pieces";
		piece_hash_dir = "/pieces/hash";
		hash_dir = "/hash";
		url_dir = "/url";
	}

	if (!attr) {
		// element content
		if (!wget_strcasecmp_ascii(dir, piece_hash_dir))
			_add_piece(ctx, value);
		else if (!wget_strcasecmp_ascii(dir, hash_dir))
			_add_file_hash(ctx, value);
		else if (!wget_strcasecmp_ascii(dir, "/size"))
			ctx->metalink->size = atoll(value);
		else if (!wget_strcasecmp_ascii(dir, url_dir))
			_add_mirror(ctx, value);

		return;
	}

	// attribute of the file element itself
	if (*dir == 0) {
		if (!wget_strcasecmp_ascii(attr, "name"))
			ctx->metalink->name = wget_strmemdup(val, len);

		return;
	}

	if (!wget_strcasecmp_ascii(dir, pieces_dir)) {
		if (!wget_strcasecmp_ascii(attr, "type"))
			sscanf(value, "%15s", ctx->hash_type); // NOTE(review): needs room for 16 bytes in ctx->hash_type - confirm
		else if (!wget_strcasecmp_ascii(attr, "length"))
			ctx->length = atoll(value);

		return;
	}

	if (!wget_strcasecmp_ascii(dir, hash_dir)) {
		if (!wget_strcasecmp_ascii(attr, "type"))
			sscanf(value, "%15s", ctx->hash_type);

		return;
	}

	if (!wget_strcasecmp_ascii(dir, url_dir)) {
		if (!wget_strcasecmp_ascii(attr, "location")) {
			sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code
		} else if (!wget_strcasecmp_ascii(attr, "preference")
			|| (!is_v3 && !wget_strcasecmp_ascii(attr, "priority")))
		{
			// "priority" is only accepted in the metalink 4 format
			sscanf(value, " %6d", &ctx->priority);

			// out-of-range values get the largest accepted number
			if (ctx->priority < 1 || ctx->priority > 999999)
				ctx->priority = 999999;
		}
	}
}
/**
 * Comparison function ordering two ENTRY items by their txt member,
 * ignoring ASCII case.
 *
 * \param[in] a1 First entry
 * \param[in] a2 Second entry
 * \return The result of wget_strcasecmp_ascii() on the two txt members
 */
static int compare_txt(struct ENTRY *a1, struct ENTRY *a2)
{
	const char *lhs = a1->txt;
	const char *rhs = a2->txt;

	return wget_strcasecmp_ascii(lhs, rhs);
}
static void test_strcasecmp_ascii(void) { static const struct test_data { const char * s1; const char * s2; int result; } test_data[] = { { NULL, NULL, 0 }, { NULL, "x", -1 }, { "x", NULL, 1 }, { "Abc", "abc", 0 }, { "abc", "abc", 0 }, { "abc", "ab", 'c' }, { "ab", "abc", -'c' }, { "abc", "", 'a' }, { "", "abc", -'a' }, }; static const struct test_data2 { const char * s1; const char * s2; size_t n; int result; } test_data2[] = { { NULL, NULL, 1, 0 }, { NULL, "x", 1, -1 }, { "x", NULL, 1, 1 }, { "Abc", "abc", 2, 0 }, { "abc", "abc", 3, 0 }, { "abc", "ab", 2, 0 }, { "abc", "ab", 3, 'c' }, { "ab", "abc", 2, 0 }, { "ab", "abc", 3, -'c' }, { "abc", "", 1, 'a' }, { "", "abc", 1, -'a' }, { "", "abc", 0, 0 }, }; for (unsigned it = 0; it < countof(test_data); it++) { const struct test_data *t = &test_data[it]; int n = wget_strcasecmp_ascii(t->s1, t->s2); if (n == t->result) ok++; else { failed++; info_printf("Failed [%u]: wget_strcasecmp_ascii(%s,%s) -> %d (expected %d)\n", it, t->s1, t->s2, n, t->result); } } for (unsigned it = 0; it < countof(test_data2); it++) { const struct test_data2 *t = &test_data2[it]; int n = wget_strncasecmp_ascii(t->s1, t->s2, t->n); if (n == t->result) ok++; else { failed++; info_printf("Failed [%u]: wget_strncasecmp_ascii(%s,%s,%zu) -> %d (expected %d)\n", it, t->s1, t->s2, t->n, n, t->result); } } for (unsigned it = 0; it < 26; it++) { char s1[8], s2[8]; s1[0] = 'a' + it; s1[1] = 0; s2[0] = 'A' + it; s2[1] = 0; if (wget_strcasecmp_ascii(s1, s2) == 0) ok++; else { failed++; info_printf("Failed: wget_strcasecmp_ascii(%s,%s) != 0\n", s1, s2); } if (wget_strncasecmp_ascii(s1, s2, 1) == 0) ok++; else { failed++; info_printf("Failed: wget_strncasecmp_ascii(%s,%s) != 0\n", s1, s2); } } }
static void test_parse_challenge(void) { static const struct test_data { const char * input; const char * scheme[3]; } test_data[] = { { // simplebasic "Basic realm=\"foo\"", { "Basic", NULL } }, { // simplebasicucase "BASIC REALM=\"foo\"", { "Basic", NULL } }, { // simplebasicucase "Basic , realm=\"foo\"", { "Basic", NULL } }, { // "Basic realm=\"test realm\"", { "Basic", NULL } }, { // "Basic realm=\"test-äöÜ\"", { "Basic", NULL } }, { // "Basic realm=\"basic\", Newauth realm=\"newauth\"", { "Basic", "Newauth", NULL } }, }; wget_vector_t *challenges; wget_http_challenge_t *challenge; // Testcases found here http://greenbytes.de/tech/tc/httpauth/ challenges = wget_vector_create(2, 2, NULL); wget_vector_set_destructor(challenges, (void(*)(void *))wget_http_free_challenge); for (unsigned it = 0; it < countof(test_data); it++) { const struct test_data *t = &test_data[it]; wget_http_parse_challenges(t->input, challenges); for (unsigned nchal = 0; nchal < countof(test_data[0].scheme) && t->scheme[nchal]; nchal++) { challenge = wget_vector_get(challenges, nchal); if (!t->scheme[nchal]) { if (challenge) { failed++; info_printf("Failed [%u]: wget_http_parse_challenges(%s) found %d challenges (expected %u)\n", it, t->input, wget_vector_size(challenges), nchal); } break; } if (!challenge) { failed++; info_printf("Failed [%u]: wget_http_parse_challenges(%s) did not find enough challenges\n", it, t->input); break; } if (!wget_strcasecmp_ascii(challenge->auth_scheme, t->scheme[nchal])) { ok++; } else { failed++; info_printf("Failed [%u]: wget_http_parse_challenges(%s) -> '%s' (expected '%s')\n", it, t->input, challenge->auth_scheme, t->scheme[nchal]); } } wget_vector_clear(challenges); } wget_http_free_challenges(&challenges); }