/*
 * Store the text content of a metalink <url> element as a download mirror.
 *
 * ctx    parser context; ctx->location and ctx->priority hold the values
 *        collected from the attributes of the current <url> element
 * value  NUL-terminated text content of the <url> element
 *
 * Only values starting with "http:" or "https:" (case-insensitive) that
 * wget_iri_parse() accepts are added to ctx->metalink->mirrors.
 *
 * The pending location/priority are reset to their defaults on every path,
 * so attributes of a skipped <url> cannot be inherited by the next mirror.
 */
static void _add_mirror(_metalink_context_t *ctx, const char *value)
{
	// Element content may be surrounded by whitespace (pretty-printed XML);
	// skip leading ASCII whitespace (space, \t, \n, \v, \f, \r) before the scheme check.
	while (*value == ' ' || (*value >= '\t' && *value <= '\r'))
		value++;

	if (!wget_strncasecmp_ascii(value, "http:", 5) || !wget_strncasecmp_ascii(value, "https:", 6)) {
		wget_metalink_mirror_t mirror;

		// memset (not "= {0}") so that padding bytes are zeroed as well,
		// the struct is handed to the vector together with its size
		memset(&mirror, 0, sizeof(mirror));
		mirror.iri = wget_iri_parse(value, NULL);

		if (mirror.iri) {
			wget_metalink_t *metalink = ctx->metalink;

			strlcpy(mirror.location, ctx->location, sizeof(mirror.location));
			mirror.priority = ctx->priority;

			// create the mirror list lazily on first use
			if (!metalink->mirrors) {
				metalink->mirrors = wget_vector_create(4, 4, NULL);
				wget_vector_set_destructor(metalink->mirrors, (void(*)(void *))_free_mirror);
			}

			wget_vector_add(metalink->mirrors, &mirror, sizeof(wget_metalink_mirror_t));
		}
	}

	// reset the per-<url> attributes for the next element
	*ctx->location = 0;
	ctx->priority = 999999;
}
/*
 * XML parser callback that collects URL candidates into ctx->urls.
 *
 * A value is collected when it is
 *  - the value of an attribute named href, uri, src, scheme, xmlns or
 *    xmlns:* (case-insensitive), or
 *  - the text content of an element whose name (last component of dir)
 *    is icon, id or logo (case-insensitive).
 *
 * context  a struct atom_context
 * flags    XML_FLG_ATTRIBUTE / XML_FLG_CONTENT select the kind of node
 * dir      path of the current element, components separated by '/'
 * attr     attribute name (evaluated for attribute nodes only)
 * val/len  the value, not necessarily NUL-terminated
 *
 * Leading and trailing whitespace is stripped. Values that are empty, or
 * become empty by stripping, are ignored. The stored wget_string_t points
 * into val, the characters themselves are not copied here.
 */
static void _atom_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	struct atom_context *ctx = context;
	int is_url = 0;

	if (!val || !len)
		return;

	if ((flags & XML_FLG_ATTRIBUTE)) {
		is_url = !wget_strcasecmp_ascii(attr, "href")
			|| !wget_strcasecmp_ascii(attr, "uri")
			|| !wget_strcasecmp_ascii(attr, "src")
			|| !wget_strcasecmp_ascii(attr, "scheme")
			|| !wget_strcasecmp_ascii(attr, "xmlns")
			|| !wget_strncasecmp_ascii(attr, "xmlns:", 6);
	} else if ((flags & XML_FLG_CONTENT)) {
		const char *elem = strrchr(dir, '/');

		if (elem) {
			elem++; // step over the '/' to the element name
			is_url = !wget_strcasecmp_ascii(elem, "icon")
				|| !wget_strcasecmp_ascii(elem, "id")
				|| !wget_strcasecmp_ascii(elem, "logo");
		}
	}

	if (!is_url)
		return;

	// skip leading spaces
	while (len && c_isspace(*val)) {
		val++;
		len--;
	}

	// skip trailing spaces
	while (len && c_isspace(val[len - 1]))
		len--;

	// a whitespace-only value is as useless as an empty one
	if (!len)
		return;

	wget_string_t url;
	url.p = val;
	url.len = len;

	if (!ctx->urls)
		ctx->urls = wget_vector_create(32, -2, NULL);

	wget_vector_add(ctx->urls, &url, sizeof(url));
}
void wget_css_parse_buffer( const char *buf, void(*callback_uri)(void *user_ctx, const char *url, size_t len, size_t pos), void(*callback_encoding)(void *user_ctx, const char *url, size_t len), void *user_ctx) { int token; size_t length, pos = 0; char *text; yyscan_t scanner; // let flex operate on buf as a 0 terminated string // we could give buflen to this function and use yy_scan_bytes or yy_scan_buffer yylex_init(&scanner); yy_scan_string(buf, scanner); while ((token = yylex(scanner)) != CSSEOF) { if (token == IMPORT_SYM) { // e.g. @import "http:example.com/index.html" pos += yyget_leng(scanner); // skip whitespace before URI/STRING while ((token = yylex(scanner)) == S) pos += yyget_leng(scanner); // now token should be STRING or URI if (token == STRING) token = URI; } if (token == URI && callback_uri) { // e.g. url(http:example.com/index.html) text = yyget_text(scanner); length = yyget_leng(scanner); if (*text == '\'' || *text == '\"') { // a string - remove the quotes callback_uri(user_ctx, text + 1, length - 2, pos + 1); } else { // extract URI from url(...) if (!wget_strncasecmp_ascii(text, "url(", 4)) { char *otext = text; // remove trailing ) and any spaces before for (length--; c_isspace(text[length - 1]); length--); // remove leading url( and any spaces after for (length -= 4, text += 4; c_isspace(*text); text++, length--); // remove quotes if (*text == '\'' || *text == '\"') { text++; length -= 2; } callback_uri(user_ctx, text, length, pos + (text - otext)); } } } else if (token == CHARSET_SYM && callback_encoding) { // e.g. 
@charset "UTF-8" pos += yyget_leng(scanner); // skip whitespace before charset name while ((token = yylex(scanner)) == S) pos += yyget_leng(scanner); // now token should be STRING if (token == STRING) { text = yyget_text(scanner); length = yyget_leng(scanner); if (*text == '\'' || *text == '\"') { // a string - remove the quotes callback_encoding(user_ctx, text + 1, length - 2); } else { // a string without quotes callback_encoding(user_ctx, text, length); } } else { error_printf(_("Unknown token after @charset: %d\n"), token); } } pos += yyget_leng(scanner); } yylex_destroy(scanner); }
static void html_parse_localfile(const char *fname) { char *data; const char *encoding = NULL; size_t len; if ((data = wget_read_file(fname, &len))) { if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) { // Big-endian UTF-16 encoding = "UTF-16BE"; // adjust behind BOM, ignore trailing single byte data += 2; len -= 2; } else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) { // Little-endian UTF-16 encoding = "UTF-16LE"; // adjust behind BOM data += 2; len -= 2; } else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) { // UTF-8 encoding = "UTF-8"; // adjust behind BOM data += 3; len -= 3; } if (encoding) printf("URI encoding '%s' set by BOM\n", encoding); if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) { size_t n; char *utf8; len -= len & 1; // ignore single trailing byte, else charset conversion fails if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) { printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding); data = utf8; } else { printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding); return; } } WGET_HTML_PARSED_RESULT *res = wget_html_get_urls_inline(data, NULL, NULL); if (encoding) { if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding)) printf("Encoding '%s' as stated in document has been ignored\n", encoding); } for (int it = 0; it < wget_vector_size(res->uris); it++) { WGET_HTML_PARSED_URL *html_url = wget_vector_get(res->uris, it); wget_string_t *url = &html_url->url; printf(" %s.%s '%.*s'\n", html_url->dir, html_url->attr, (int) url->len, url->p); } wget_xfree(data); wget_html_free_urls_inline(&res); } }
/*
 * XML parser callback that fills a metalink description.
 *
 * Handles attributes and text content of elements below
 *  - /metalink/files/file  (metalink 3 XML format)
 *  - /metalink/file        (metalink 4 XML format)
 * Everything else, including nodes that are neither attribute nor content,
 * is ignored.
 *
 * context  a _metalink_context_t
 * flags    XML_FLG_ATTRIBUTE / XML_FLG_CONTENT select the kind of node
 * dir      path of the current element
 * attr     attribute name, or NULL for element content
 * val/len  the value, not necessarily NUL-terminated; a NULL val is
 *          treated like an empty value
 *
 * Attributes update ctx (hash type, piece length, mirror location and
 * priority) and the file name; element content is passed on to
 * _add_piece(), _add_file_hash() and _add_mirror() or stored as file size.
 */
static void _metalink_parse(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED)
{
	_metalink_context_t *ctx = context;

	if (!(flags & (XML_FLG_CONTENT | XML_FLG_ATTRIBUTE)))
		return; // ignore comments

	if (wget_strncasecmp_ascii(dir, "/metalink/file", 14))
		return;

	dir += 14;

	// NUL-terminated copy of the value for sscanf()/atoll() and the _add_*() helpers.
	// Created behind the early returns, so ignored nodes don't use stack space for it.
	// memcpy() must not be called with a NULL source, not even with a length of 0.
	const size_t vlen = val ? len : 0;
	char value[vlen + 1];

	if (vlen)
		memcpy(value, val, vlen);
	value[vlen] = 0;

	if (!wget_strncasecmp_ascii(dir, "s/file", 6)) {
		// metalink 3 XML format
		dir += 6;

		if (attr) {
			if (*dir == 0) { // /metalink/files/file
				if (!wget_strcasecmp_ascii(attr, "name")) {
					ctx->metalink->name = wget_strmemdup(val, len);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/verification/pieces")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				} else if (!wget_strcasecmp_ascii(attr, "length")) {
					ctx->length = atoll(value);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/resources/url")) {
				// other attributes of the url element (e.g. protocol, type) are not evaluated
				if (!wget_strcasecmp_ascii(attr, "location")) {
					sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code
				} else if (!wget_strcasecmp_ascii(attr, "preference")) {
					sscanf(value, " %6d", &ctx->priority);
					if (ctx->priority < 1 || ctx->priority > 999999)
						ctx->priority = 999999;
				}
			}
		} else {
			if (!wget_strcasecmp_ascii(dir, "/verification/pieces/hash")) {
				_add_piece(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) {
				_add_file_hash(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/size")) {
				ctx->metalink->size = atoll(value);
			} else if (!wget_strcasecmp_ascii(dir, "/resources/url")) {
				_add_mirror(ctx, value);
			}
		}
	} else {
		// metalink 4 XML format
		if (attr) {
			if (*dir == 0) { // /metalink/file
				if (!wget_strcasecmp_ascii(attr, "name")) {
					ctx->metalink->name = wget_strmemdup(val, len);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/pieces")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				} else if (!wget_strcasecmp_ascii(attr, "length")) {
					ctx->length = atoll(value);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/hash")) {
				if (!wget_strcasecmp_ascii(attr, "type")) {
					sscanf(value, "%15s", ctx->hash_type);
				}
			} else if (!wget_strcasecmp_ascii(dir, "/url")) {
				if (!wget_strcasecmp_ascii(attr, "location")) {
					sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code
				} else if (!wget_strcasecmp_ascii(attr, "priority") || !wget_strcasecmp_ascii(attr, "preference")) {
					sscanf(value, " %6d", &ctx->priority);
					if (ctx->priority < 1 || ctx->priority > 999999)
						ctx->priority = 999999;
				}
			}
		} else {
			if (!wget_strcasecmp_ascii(dir, "/pieces/hash")) {
				_add_piece(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/hash")) {
				_add_file_hash(ctx, value);
			} else if (!wget_strcasecmp_ascii(dir, "/size")) {
				ctx->metalink->size = atoll(value);
			} else if (!wget_strcasecmp_ascii(dir, "/url")) {
				_add_mirror(ctx, value);
			}
		}
	}
}
static void test_strcasecmp_ascii(void) { static const struct test_data { const char * s1; const char * s2; int result; } test_data[] = { { NULL, NULL, 0 }, { NULL, "x", -1 }, { "x", NULL, 1 }, { "Abc", "abc", 0 }, { "abc", "abc", 0 }, { "abc", "ab", 'c' }, { "ab", "abc", -'c' }, { "abc", "", 'a' }, { "", "abc", -'a' }, }; static const struct test_data2 { const char * s1; const char * s2; size_t n; int result; } test_data2[] = { { NULL, NULL, 1, 0 }, { NULL, "x", 1, -1 }, { "x", NULL, 1, 1 }, { "Abc", "abc", 2, 0 }, { "abc", "abc", 3, 0 }, { "abc", "ab", 2, 0 }, { "abc", "ab", 3, 'c' }, { "ab", "abc", 2, 0 }, { "ab", "abc", 3, -'c' }, { "abc", "", 1, 'a' }, { "", "abc", 1, -'a' }, { "", "abc", 0, 0 }, }; for (unsigned it = 0; it < countof(test_data); it++) { const struct test_data *t = &test_data[it]; int n = wget_strcasecmp_ascii(t->s1, t->s2); if (n == t->result) ok++; else { failed++; info_printf("Failed [%u]: wget_strcasecmp_ascii(%s,%s) -> %d (expected %d)\n", it, t->s1, t->s2, n, t->result); } } for (unsigned it = 0; it < countof(test_data2); it++) { const struct test_data2 *t = &test_data2[it]; int n = wget_strncasecmp_ascii(t->s1, t->s2, t->n); if (n == t->result) ok++; else { failed++; info_printf("Failed [%u]: wget_strncasecmp_ascii(%s,%s,%zu) -> %d (expected %d)\n", it, t->s1, t->s2, t->n, n, t->result); } } for (unsigned it = 0; it < 26; it++) { char s1[8], s2[8]; s1[0] = 'a' + it; s1[1] = 0; s2[0] = 'A' + it; s2[1] = 0; if (wget_strcasecmp_ascii(s1, s2) == 0) ok++; else { failed++; info_printf("Failed: wget_strcasecmp_ascii(%s,%s) != 0\n", s1, s2); } if (wget_strncasecmp_ascii(s1, s2, 1) == 0) ok++; else { failed++; info_printf("Failed: wget_strncasecmp_ascii(%s,%s) != 0\n", s1, s2); } } }