/**
 * Function wrapper around the _iri_isgendelim() check (defined elsewhere
 * in this file or its headers).
 *
 * An earlier implementation, kept here for reference, was
 *     strchr(":/?#[]@", c) != NULL
 * NOTE(review): presumably _iri_isgendelim() tests for that same set of
 * delimiter characters - confirm against its definition.
 *
 * @param c  character to classify
 * @return   the result of _iri_isgendelim(c), converted to int
 */
int mget_iri_isgendelim(char c)
{
	const int is_delim = _iri_isgendelim(c);

	return is_delim;
}
mget_iri_t *mget_iri_parse(const char *url, const char *encoding) { mget_iri_t *iri; const char *default_port = NULL; char *p, *s, *authority, c; size_t slen, it; int url_allocated, maybe_scheme; if (!url) return NULL; /* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ while (isspace(*url)) url++; if (!*url) return NULL; // first unescape, than convert to UTF-8 if (strchr(url, '%')) { char *unesc_url = strdup(url); mget_percent_unescape(unesc_url); if (mget_str_needs_encoding(unesc_url)) { if ((url = mget_str_to_utf8(unesc_url, encoding))) xfree(unesc_url); else url = unesc_url; // on error, use what we have } else url = unesc_url; url_allocated = 1; } else { url_allocated = 0; if (mget_str_needs_encoding(url)) { if ((s = mget_str_to_utf8(url, encoding))) { url = s; url_allocated = 1; } } } // just use one block of memory for all parsed URI parts slen = strlen(url); iri = xmalloc(sizeof(mget_iri_t) + slen * 2 + 2); memset(iri, 0, sizeof(mget_iri_t)); strcpy(((char *)iri) + sizeof(mget_iri_t), url); iri->uri = ((char *)iri) + sizeof(mget_iri_t); s = ((char *)iri) + sizeof(mget_iri_t) + slen + 1; strcpy(s, url); if (url_allocated) xfree(url); p = s; if (isalpha(*p)) { maybe_scheme = 1; while (*s && !_iri_isgendelim(*s)) { if (maybe_scheme && !_iri_isscheme(*s)) maybe_scheme = 0; s++; } } else maybe_scheme = 0; if (maybe_scheme && (*s == ':' && (s[1] == '/' || s[1] == 0))) { // found a scheme *s++ = 0; // find the scheme in our static list of supported schemes // for later comparisons we compare pointers (avoiding strcasecmp()) iri->scheme = p; for (it = 0; mget_iri_schemes[it]; it++) { if (!mget_strcasecmp_ascii(mget_iri_schemes[it], p)) { iri->scheme = mget_iri_schemes[it]; default_port = iri_ports[it]; break; } } if (iri->scheme == p) { // convert scheme to lowercase mget_strtolower((char *)iri->scheme); } } else { 
iri->scheme = MGET_IRI_SCHEME_DEFAULT; default_port = iri_ports[0]; // port 80 s = p; // rewind } // this is true for http, https, ftp, file if (s[0] == '/' && s[1] == '/') s += 2; // authority authority = s; while (*s && *s != '/' && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; // left over: [path][?query][#fragment] if (c == '/') { iri->path = s; while (*s && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '?') { iri->query = s; while (*s && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '#') { iri->fragment = s; s += strlen(s); } if (*s) { debug_printf("unparsed rest '%s'\n", s); } if (*authority) { s = authority; p = strchr(authority, '@'); if (p) { iri->userinfo = s; *p = 0; s = p + 1; } if (*s == '[') { p = strrchr(s, ']'); if (p) { iri->host = s + 1; *p = 0; s = p + 1; } else { // something is broken iri->host = s + 1; s += strlen(s); } } else { iri->host = s; while (*s && *s != ':') s++; } if (*s == ':') { if (s[1]) { if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port))) iri->port = s + 1; } } *s = 0; } iri->resolv_port = iri->port ? iri->port : default_port; // now unescape all components (not interested in display, userinfo, password) if (iri->host) { mget_strtolower((char *)iri->host); if ((p = (char *)mget_str_to_ascii(iri->host)) != iri->host) { iri->host = p; iri->host_allocated = 1; } } else { if (iri->scheme == MGET_IRI_SCHEME_HTTP || iri->scheme == MGET_IRI_SCHEME_HTTPS) { error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri); mget_iri_free(&iri); return NULL; } } /* debug_printf("scheme=%s\n",iri->scheme); debug_printf("host=%s\n",iri->host); debug_printf("path=%s\n",iri->path); debug_printf("query=%s\n",iri->query); debug_printf("fragment=%s\n",iri->fragment); */ return iri; }
MGET_IRI *mget_iri_parse(const char *s_uri, const char *encoding) { MGET_IRI *iri; const char *default_port = NULL; char *p, *s, *authority, c; size_t slen, it; if (!s_uri) return NULL; /* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ while (isspace(*s_uri)) s_uri++; if (!*s_uri) return NULL; // just use one block of memory for all parsed URI parts slen = strlen(s_uri); iri = xmalloc(sizeof(MGET_IRI) + slen * 2 + 2); memset(iri, 0, sizeof(MGET_IRI)); strcpy(((char *)iri) + sizeof(MGET_IRI), s_uri); iri->uri = ((char *)iri) + sizeof(MGET_IRI); s = ((char *)iri) + sizeof(MGET_IRI) + slen + 1; strcpy(s, s_uri); p = s; while (*s && !_iri_isgendelim(*s)) s++; if (*s == ':' && s[1]=='/') { // found a scheme *s++ = 0; // find the scheme in our static list of supported schemes // for later comparisons we compare pointers (avoiding strcasecmnp()) iri->scheme = p; for (it = 0; iri_schemes[it]; it++) { if (!strcasecmp(iri_schemes[it], p)) { iri->scheme = iri_schemes[it]; default_port = iri_ports[it]; break; } } if (iri->scheme == p) { // convert scheme to lowercase for (; *p; p++) if (isupper(*p)) *p = tolower(*p); } } else { iri->scheme = IRI_SCHEME_DEFAULT; default_port = iri_ports[0]; // port 80 s = p; // rewind } // this is true for http, https, ftp, file if (s[0] == '/' && s[1] == '/') s += 2; // authority authority = s; while (*s && *s != '/' && *s != '?' && *s != '#') s++; c = *s; if (c) *s++ = 0; // left over: [path][?query][#fragment] if (c == '/') { iri->path = s; while (*s && *s != '?' 
&& *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '?') { iri->query = s; while (*s && *s != '#') s++; c = *s; if (c) *s++ = 0; } if (c == '#') { iri->fragment = s; while (*s) s++; } if (*s) { debug_printf("unparsed rest '%s'\n", s); } if (*authority) { s = authority; p = strchr(authority, '@'); if (p) { iri->userinfo = s; *p = 0; s = p + 1; } if (*s == '[') { p = strrchr(s, ']'); if (p) { iri->host = s + 1; *p = 0; s = p + 1; } else { // something is broken iri->host = s + 1; while (*s) s++; } } else { iri->host = s; while (*s && *s != ':') s++; } if (*s == ':') { if (s[1]) { if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port))) iri->port = s + 1; } } *s = 0; /* for (p = (char *)iri->host; *p; p++) if (*p >= 'A' && *p <= 'Z') // isupper() also returns true for chars > 0x7f, the test is not EBCDIC compatible ;-) *p = tolower(*p); */ } iri->resolv_port = iri->port ? iri->port : default_port; // now unescape all components (not interested in display, userinfo, password if (iri->host) { const char *host_utf; char *p; _unescape((unsigned char *)iri->host); host_utf = mget_str_to_utf8(iri->host, encoding); if (host_utf) { char *host_asc = NULL; int rc; if ((rc = idna_to_ascii_8z(host_utf, &host_asc, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) { // log_printf("toASCII '%s' -> '%s'\n", host_utf, host_asc); iri->host = host_asc; iri->host_allocated = 1; } else error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); xfree(host_utf); } for (p = (char *)iri->host; *p; p++) if (*p >= 'A' && *p <= 'Z') // isupper() also returns true for chars > 0x7f, the test is not EBCDIC compatible ;-) *p = tolower(*p); } else { if (iri->scheme == IRI_SCHEME_HTTP || iri->scheme == IRI_SCHEME_HTTPS) { error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri); mget_iri_free(&iri); return NULL; } } if (iri->path) _unescape((unsigned char *)iri->path); if (iri->query) _unescape((unsigned char *)iri->query); if (iri->fragment) 
_unescape((unsigned char *)iri->fragment); // info_printf("%s: path '%s'\n", iri->uri, iri->path); return iri; }