// Callback function, called from CSS parser for each @charset found. static void _css_get_encoding(void *context, const char *encoding, size_t len) { _CSS_CONTEXT *ctx = context; // take only the first @charset rule if (!*ctx->encoding) { *ctx->encoding = wget_strmemdup(encoding, len); debug_printf(_("URI content encoding = '%s'\n"), *ctx->encoding); } }
// Convert the NUL-terminated string 'src' from 'src_encoding' to 'dst_encoding'.
//
// Returns NULL if 'src' is NULL or (with HAVE_ICONV) if the conversion could
// not be set up or failed. Otherwise the result is a separately allocated
// string: the converted text, or a plain copy of 'src' when both encodings
// compare equal via wget_strcasecmp_ascii() or when built without HAVE_ICONV.
// A NULL encoding name is treated as "iso-8859-1".
char *wget_charset_transcode(const char *src, const char *src_encoding, const char *dst_encoding)
{
	if (!src)
		return NULL;

#ifdef HAVE_ICONV
	if (!src_encoding)
		src_encoding = "iso-8859-1"; // default character-set for most browsers
	if (!dst_encoding)
		dst_encoding = "iso-8859-1"; // default character-set for most browsers

	if (wget_strcasecmp_ascii(src_encoding, dst_encoding) != 0) {
		char *converted = NULL;
		iconv_t cd = iconv_open(dst_encoding, src_encoding);

		if (cd == (iconv_t)-1) {
			error_printf(_("Failed to prepare encoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
			return NULL;
		}

		// iconv() advances the in/out cursors and decrements the remaining
		// counters, so it works on copies and leaves 'src' / 'out' untouched
		char *in_pos = (char *) src;
		size_t in_left = strlen(src);
		size_t out_size = in_left * 6;
		size_t out_left = out_size;
		char *out = xmalloc(out_size + 1);
		char *out_pos = out;

		// the second call (NULL input) flushes any pending shift sequence
		if (iconv(cd, &in_pos, &in_left, &out_pos, &out_left) == (size_t)-1
			|| iconv(cd, NULL, NULL, &out_pos, &out_left) == (size_t)-1)
		{
			error_printf(_("Failed to convert '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
		} else {
			converted = wget_strmemdup(out, out_size - out_left);
			debug_printf("converted '%s' (%s) -> '%s' (%s)\n", src, src_encoding, converted, dst_encoding);
		}

		xfree(out);
		iconv_close(cd);

		return converted;
	}
#endif

	return strdup(src);
}
// Callback function, called from CSS parser for each URI found. static void _css_get_url(void *context, const char *url, size_t len, size_t pos) { _CSS_CONTEXT *ctx = context; WGET_PARSED_URL parsed_url = { .len = len, .pos = pos, .url = wget_strmemdup(url, len), .abs_url = NULL }; if (!ctx->uris) { ctx->uris = wget_vector_create(16, -2, NULL); wget_vector_set_destructor(ctx->uris, (wget_vector_destructor_t)_free_url); } wget_vector_add(ctx->uris, &parsed_url, sizeof(parsed_url)); } static void _urls_to_absolute(wget_vector_t *urls, wget_iri_t *base) { if (base && urls) { wget_buffer_t buf; wget_buffer_init(&buf, NULL, 1024); for (int it = 0; it < wget_vector_size(urls); it++) { WGET_PARSED_URL *url = wget_vector_get(urls, it); if (wget_iri_relative_to_abs(base, url->url, url->len, &buf)) url->abs_url = wget_strmemdup(buf.data, buf.length); else error_printf("Cannot resolve relative URI '%s'\n", url->url); } wget_buffer_deinit(&buf); } } wget_vector_t *wget_css_get_urls(const char *css, size_t len, wget_iri_t *base, const char **encoding) { _CSS_CONTEXT context = { .encoding = encoding }; wget_css_parse_buffer(css, len, _css_get_url, encoding ? _css_get_encoding : NULL, &context); _urls_to_absolute(context.uris, base); return context.uris; }
// Convert 'src' to its IDNA ASCII form, if wget_str_needs_encoding() says so.
//
// Returns 'src' itself when no conversion is needed, when no IDN library is
// compiled in, or when the conversion fails (failures are reported through
// error_printf()). On success the pointer returned is the string produced by
// the IDN library, i.e. it differs from 'src'.
// NOTE(review): the caller presumably has to release a result that differs
// from 'src' - confirm against the callers.
const char *wget_str_to_ascii(const char *src)
{
	// nothing to do in any build configuration
	if (!wget_str_needs_encoding(src))
		return src;

#ifdef WITH_LIBIDN2
	char *ascii = NULL;
	int rc;

#ifdef WITH_LIBUNISTRING
	uint8_t stackbuf[256];
	size_t lower_len = sizeof(stackbuf) - 1; // leave space for additional \0 byte

	// we need a conversion to lowercase
	uint8_t *lower = u8_tolower((uint8_t *)src, u8_strlen((uint8_t *)src), 0, UNINORM_NFKC, stackbuf, &lower_len);

	if (!lower) {
		error_printf("u8_tolower(%s) failed (%d)\n", src, errno);
		return src;
	}

	// u8_tolower() does not terminate the result string
	if (lower == stackbuf) {
		lower[lower_len] = 0;
	} else {
		// result did not fit into stackbuf: make a terminated copy and drop the original
		uint8_t *unterminated = lower;

		lower = (uint8_t *)wget_strmemdup((char *)unterminated, lower_len);
		xfree(unterminated);
	}

	rc = idn2_lookup_u8(lower, (uint8_t **)&ascii, 0);
	if (rc != IDN2_OK) {
		error_printf(_("toASCII(%s) failed (%d): %s\n"), lower, rc, idn2_strerror(rc));
	} else {
		debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
		src = ascii;
	}

	if (lower != stackbuf)
		xfree(lower);
#else
	rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, 0);
	if (rc != IDN2_OK) {
		error_printf(_("toASCII(%s) failed (%d): %s\n"), src, rc, idn2_strerror(rc));
	} else {
		debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
		src = ascii;
	}
#endif
#elif WITH_LIBIDN
	char *ascii = NULL;
	int rc;

	if (!_utf8_is_valid(src)) {
		error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
	} else {
		// idna_to_ascii_8z() automatically converts UTF-8 to lowercase
		rc = idna_to_ascii_8z(src, &ascii, IDNA_USE_STD3_ASCII_RULES);
		if (rc != IDNA_SUCCESS)
			error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));
		else
			src = ascii;
	}
#else
	error_printf(_("toASCII not available: '%s'\n"), src);
#endif

	return src;
}
// Load the .netrc style file 'fname' into 'netrc_db'.
//
// Format handling:
//  - lines whose first non-blank character is '#' are skipped
//  - 'machine <name>' and 'default' start a new entry; the previous entry (if
//    any) is handed to wget_netrc_db_add()
//  - 'login <value>' and 'password <value>' fill the current entry (the first
//    occurrence wins), other keys consume their value and are ignored
//  - 'macdef' starts a macro definition that is skipped up to the next empty line
//  - several 'key value' pairs may share one line
//
// Returns -1 if 'netrc_db' is NULL or 'fname' is NULL/empty, 0 if the file
// could not be opened (reported unless it simply does not exist), otherwise
// the number of entries in netrc_db->machines after loading.
int wget_netrc_db_load(wget_netrc_db_t *netrc_db, const char *fname)
{
	wget_netrc_t netrc;
	FILE *fp;
	char *buf = NULL, *linep, *p, *key = NULL;
	size_t bufsize = 0;
	ssize_t buflen;
	int nentries = 0, in_macdef = 0, in_machine = 0;

	if (!netrc_db || !fname || !*fname)
		return -1;

	if (!(fp = fopen(fname, "r"))) {
		// a missing file is not worth a message
		if (errno != ENOENT)
			error_printf(_("Failed to open .netrc file '%s' (%d)\n"), fname, errno);
		return nentries;
	}

	while ((buflen = wget_getline(&buf, &bufsize, fp)) >= 0) {
		// strip off \r\n - the last character of the line is buf[buflen - 1]
		// (assuming wget_getline() returns the line length like getline() does)
		while (buflen > 0 && (buf[buflen - 1] == '\n' || buf[buflen - 1] == '\r'))
			buf[--buflen] = 0;

		// ignore leading whitespace; the cast keeps isspace() defined for bytes >= 0x80
		linep = buf;
		while (isspace((unsigned char)*linep)) linep++;

		if (*linep == '#')
			continue; // skip comments

		if (!*linep) {
			// empty lines reset macro processing
			in_macdef = 0;
			continue;
		}

		if (in_macdef)
			continue; // still processing 'macdef' macro

		// now we expect key value pairs, e.g.: machine example.com login joe
		// (linep always points to a non-blank character at the top of the loop)
		while (*linep) {
			for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;
			xfree(key);
			key = wget_strmemdup(p, linep - p);

			while (isspace((unsigned char)*linep)) linep++;

			if (!strcmp(key, "machine") || !strcmp(key, "default")) {
				if (in_machine)
					wget_netrc_db_add(netrc_db, wget_memdup(&netrc, sizeof(netrc)));

				wget_netrc_init(&netrc);
				in_machine = 1;

				if (!strcmp(key, "default")) {
					netrc.key = wget_strdup("default");
					continue; // 'default' takes no value
				}
			} else if (!in_machine)
				continue; // token outside of machine or default

			// the value belonging to 'key'
			for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;

			if (!strcmp(key, "machine")) {
				// the host name is the entry's key
				if (!netrc.key)
					netrc.key = wget_strmemdup(p, linep - p);
			} else if (!strcmp(key, "login")) {
				if (!netrc.login)
					netrc.login = wget_strmemdup(p, linep - p);
			} else if (!strcmp(key, "password")) {
				if (!netrc.password)
					netrc.password = wget_strmemdup(p, linep - p);
			} else if (!strcmp(key, "macdef")) {
				in_macdef = 1; // the code above skips until the next empty line
				break;
			}

			while (isspace((unsigned char)*linep)) linep++;
		}
	}

	// the last entry has not been followed by another 'machine' / 'default'
	if (in_machine)
		wget_netrc_db_add(netrc_db, wget_memdup(&netrc, sizeof(netrc)));

	xfree(key);
	xfree(buf);
	fclose(fp);

	nentries = wget_hashmap_size(netrc_db->machines);

	debug_printf("loaded %d .netrc %s\n", nentries, nentries != 1 ? "entries" : "entry");

	return nentries;
}
// Read OCSP entries from 'fp' into 'ocsp_db'.
//
// Each non-empty, non-comment ('#') line holds up to four blank-separated
// fields: key, maxage, mtime, valid. A line needs at least key and maxage to
// be accepted; lines whose maxage lies before the current time are dropped
// silently, lines lacking maxage are reported. Accepted entries are added as
// hosts if 'load_hosts' is non-zero, else as fingerprints.
//
// Returns -1 if the stream is in error state after reading, else 0.
static int _ocsp_db_load(wget_ocsp_db_t *ocsp_db, FILE *fp, int load_hosts)
{
	wget_ocsp_t ocsp;
	char *buf = NULL, *linep, *p;
	size_t bufsize = 0;
	ssize_t buflen;
	time_t now = time(NULL);
	int ok;

	while ((buflen = wget_getline(&buf, &bufsize, fp)) >= 0) {
		// strip off \r\n - the last character of the line is buf[buflen - 1]
		// (assuming wget_getline() returns the line length like getline() does)
		while (buflen > 0 && (buf[buflen - 1] == '\n' || buf[buflen - 1] == '\r'))
			buf[--buflen] = 0;

		// ignore leading whitespace; the cast keeps isspace() defined for bytes >= 0x80
		linep = buf;
		while (isspace((unsigned char)*linep)) linep++;

		if (!*linep) continue; // skip empty lines

		if (*linep == '#')
			continue; // skip comments

		wget_ocsp_init(&ocsp);
		ok = 0;

		// parse the key (linep points to a non-blank character here)
		for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;
		ocsp.key = wget_strmemdup(p, linep - p);

		// parse max age; fields may be separated by more than one blank
		while (isspace((unsigned char)*linep)) linep++;
		if (*linep) {
			for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;
			ocsp.maxage = atol(p);
			if (ocsp.maxage < now) {
				// drop expired entry
				wget_ocsp_deinit(&ocsp);
				continue;
			}
			ok = 1;
		}

		// parse mtime (age of this entry)
		while (isspace((unsigned char)*linep)) linep++;
		if (*linep) {
			for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;
			ocsp.mtime = atol(p);
		}

		// parse the 'valid' flag
		while (isspace((unsigned char)*linep)) linep++;
		if (*linep) {
			for (p = linep; *linep && !isspace((unsigned char)*linep);) linep++;
			ocsp.valid = atoi(p);
		}

		if (ok) {
			if (load_hosts)
				wget_ocsp_db_add_host(ocsp_db, wget_memdup(&ocsp, sizeof(ocsp)));
			else
				wget_ocsp_db_add_fingerprint(ocsp_db, wget_memdup(&ocsp, sizeof(ocsp)));
		} else {
			wget_ocsp_deinit(&ocsp);
			error_printf(_("Failed to parse OCSP line: '%s'\n"), buf);
		}
	}

	xfree(buf);

	if (ferror(fp))
		return -1;

	return 0;
}
static void _metalink_parse(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos G_GNUC_WGET_UNUSED) { _metalink_context_t *ctx = context; char value[len + 1]; // info_printf("\n%02X %s %s '%s'\n", flags, dir, attr, value); if (!(flags & (XML_FLG_CONTENT | XML_FLG_ATTRIBUTE))) return; // ignore comments if (wget_strncasecmp_ascii(dir, "/metalink/file", 14)) return; dir += 14; memcpy(value, val, len); value[len] = 0; if (!wget_strncasecmp_ascii(dir, "s/file", 6)) { // metalink 3 XML format dir += 6; if (attr) { if (*dir == 0) { // /metalink/file if (!wget_strcasecmp_ascii(attr, "name")) { ctx->metalink->name = wget_strmemdup(val, len); } } else if (!wget_strcasecmp_ascii(dir, "/verification/pieces")) { if (!wget_strcasecmp_ascii(attr, "type")) { sscanf(value, "%15s", ctx->hash_type); } else if (!wget_strcasecmp_ascii(attr, "length")) { ctx->length = atoll(value); } // } else if (!wget_strcasecmp_ascii(dir, "/verification/pieces/hash")) { // if (!wget_strcasecmp_ascii(attr, "type")) { // ctx->id = atoi(value); // } } else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) { if (!wget_strcasecmp_ascii(attr, "type")) { sscanf(value, "%15s", ctx->hash_type); } } else if (!wget_strcasecmp_ascii(dir, "/resources/url")) { if (!wget_strcasecmp_ascii(attr, "location")) { sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code // } else if (!wget_strcasecmp_ascii(attr, "protocol")) { // sscanf(value, " %7[a-zA-Z]", ctx->protocol); // type of URL, e.g. HTTP, HTTPS, FTP, ... // } else if (!wget_strcasecmp_ascii(attr, "type")) { // sscanf(value, " %2[a-zA-Z]", ctx->type); // type of URL, e.g. HTTP, FTP, ... 
} else if (!wget_strcasecmp_ascii(attr, "preference")) { sscanf(value, " %6d", &ctx->priority); if (ctx->priority < 1 || ctx->priority > 999999) ctx->priority = 999999; } } } else { if (!wget_strcasecmp_ascii(dir, "/verification/pieces/hash")) { _add_piece(ctx, value); } else if (!wget_strcasecmp_ascii(dir, "/verification/hash")) { _add_file_hash(ctx, value); } else if (!wget_strcasecmp_ascii(dir, "/size")) { ctx->metalink->size = atoll(value); } else if (!wget_strcasecmp_ascii(dir, "/resources/url")) { _add_mirror(ctx, value); } } } else { // metalink 4 XML format if (attr) { if (*dir == 0) { // /metalink/file if (!wget_strcasecmp_ascii(attr, "name")) { ctx->metalink->name = wget_strmemdup(val, len); } } else if (!wget_strcasecmp_ascii(dir, "/pieces")) { if (!wget_strcasecmp_ascii(attr, "type")) { sscanf(value, "%15s", ctx->hash_type); } else if (!wget_strcasecmp_ascii(attr, "length")) { ctx->length = atoll(value); } } else if (!wget_strcasecmp_ascii(dir, "/hash")) { if (!wget_strcasecmp_ascii(attr, "type")) { sscanf(value, "%15s", ctx->hash_type); } } else if (!wget_strcasecmp_ascii(dir, "/url")) { if (!wget_strcasecmp_ascii(attr, "location")) { sscanf(value, " %2[a-zA-Z]", ctx->location); // ISO 3166-1 alpha-2 two letter country code } else if (!wget_strcasecmp_ascii(attr, "priority") || !wget_strcasecmp_ascii(attr, "preference")) { sscanf(value, " %6d", &ctx->priority); if (ctx->priority < 1 || ctx->priority > 999999) ctx->priority = 999999; } } } else { if (!wget_strcasecmp_ascii(dir, "/pieces/hash")) { _add_piece(ctx, value); } else if (!wget_strcasecmp_ascii(dir, "/hash")) { _add_file_hash(ctx, value); } else if (!wget_strcasecmp_ascii(dir, "/size")) { ctx->metalink->size = atoll(value); } else if (!wget_strcasecmp_ascii(dir, "/url")) { _add_mirror(ctx, value); } } } }