Exemple #1
0
/**
 * Public wrapper around the internal `_iri_isgendelim()` classification.
 *
 * @param c  Character to classify.
 * @return   The result of `_iri_isgendelim(c)`, converted to int.
 *           NOTE(review): presumably non-zero exactly for the characters
 *           ":/?#[]@" (the set used by the former strchr()-based
 *           implementation) - confirm against `_iri_isgendelim`.
 */
int mget_iri_isgendelim(char c)
{
	const int is_gendelim = _iri_isgendelim(c);

	return is_gendelim;
}
Exemple #2
0
/**
 * Parse a URL/IRI string into its components.
 *
 * Leading whitespace is skipped. If the input contains a '%', a copy is
 * percent-unescaped first; if mget_str_needs_encoding() says so, the string
 * is converted with mget_str_to_utf8() using `encoding` (on conversion
 * failure the unconverted string is used).
 *
 * The result is a single xmalloc()ed block holding the mget_iri_t followed by
 * two copies of the (possibly converted) URL: the first stays intact as
 * `iri->uri`, the second is split in place by writing NUL bytes, and
 * scheme/userinfo/host/port/path/query/fragment point into it.
 * Exceptions: `scheme` points to a static entry of `mget_iri_schemes` when the
 * scheme is known, and `host` points to a separate string (with
 * `host_allocated` set) when mget_str_to_ascii() returns a different pointer.
 *
 * Details visible in the code:
 *  - A scheme is only recognized if the text before the first gen-delimiter
 *    starts with a letter, consists of scheme characters and is followed by
 *    ":/" or a ':' at the end of the string. Otherwise
 *    MGET_IRI_SCHEME_DEFAULT and iri_ports[0] are used.
 *  - `path` is stored without its leading '/'.
 *  - `port` is only set when it differs from the scheme's default port;
 *    `resolv_port` is `port` if set, else the default port (may be NULL for
 *    unknown schemes).
 *
 * @param url       NUL-terminated URL text; may be NULL.
 * @param encoding  Passed through to mget_str_to_utf8().
 * @return          The parsed IRI, or NULL if `url` is NULL, empty or
 *                  whitespace-only, if duplicating the URL fails, or if an
 *                  http/https URL has no host.
 *                  NOTE(review): presumably released by the caller with
 *                  mget_iri_free() - confirm.
 */
mget_iri_t *mget_iri_parse(const char *url, const char *encoding)
{
	mget_iri_t *iri;
	const char *default_port = NULL;
	char *p, *s, *authority, c;
	size_t slen, it;
	int url_allocated, maybe_scheme;

	if (!url)
		return NULL;

	/*
		URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
		hier-part   = "//" authority path-abempty / path-absolute / path-rootless / path-empty
		scheme      =  ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
	 */
	// <ctype.h> functions need a value representable as unsigned char;
	// passing a negative plain char (bytes >= 0x80) is undefined behavior.
	while (isspace((unsigned char)*url)) url++;
	if (!*url) return NULL;

	// first unescape, then convert to UTF-8
	if (strchr(url, '%')) {
		char *unesc_url = strdup(url);

		if (!unesc_url)
			return NULL; // out of memory, nothing allocated so far

		mget_percent_unescape(unesc_url);

		if (mget_str_needs_encoding(unesc_url)) {
			if ((url = mget_str_to_utf8(unesc_url, encoding)))
				xfree(unesc_url);
			else
				url = unesc_url; // on error, use what we have
		} else
			url = unesc_url;

		url_allocated = 1;
	} else {
		url_allocated = 0;

		if (mget_str_needs_encoding(url)) {
			if ((s = mget_str_to_utf8(url, encoding))) {
				url = s;
				url_allocated = 1;
			}
		}
	}

	// just use one block of memory for all parsed URI parts:
	// [mget_iri_t][pristine copy of url + NUL][working copy of url + NUL]
	slen = strlen(url);
	iri = xmalloc(sizeof(mget_iri_t) + slen * 2 + 2);
	memset(iri, 0, sizeof(mget_iri_t));
	strcpy(((char *)iri) + sizeof(mget_iri_t), url);
	iri->uri = ((char *)iri) + sizeof(mget_iri_t);
	s = ((char *)iri) + sizeof(mget_iri_t) + slen + 1;
	strcpy(s, url);
	if (url_allocated)
		xfree(url);

	// scan up to the first gen-delimiter, tracking whether everything seen
	// so far could be a scheme
	p = s;
	if (isalpha((unsigned char)*p)) {
		maybe_scheme = 1;
		while (*s && !_iri_isgendelim(*s)) {
			if (maybe_scheme && !_iri_isscheme(*s))
				maybe_scheme = 0;
			s++;
		}
	} else
		maybe_scheme = 0;

	if (maybe_scheme && (*s == ':' && (s[1] == '/' || s[1] == 0))) {
		// found a scheme
		*s++ = 0;

		// find the scheme in our static list of supported schemes
		// for later comparisons we compare pointers (avoiding strcasecmp())
		iri->scheme = p;
		for (it = 0; mget_iri_schemes[it]; it++) {
			if (!mget_strcasecmp_ascii(mget_iri_schemes[it], p)) {
				iri->scheme = mget_iri_schemes[it];
				default_port = iri_ports[it];
				break;
			}
		}

		if (iri->scheme == p) {
			// unknown scheme: keep our own copy, converted to lowercase
			mget_strtolower((char *)iri->scheme);
		}

	} else {
		iri->scheme = MGET_IRI_SCHEME_DEFAULT;
		default_port = iri_ports[0]; // port 80
		s = p; // rewind
	}

	// this is true for http, https, ftp, file
	if (s[0] == '/' && s[1] == '/')
		s += 2;

	// authority
	authority = s;
	while (*s && *s != '/' && *s != '?' && *s != '#')
		s++;
	c = *s;
	if (c) *s++ = 0;

	// left over: [path][?query][#fragment]
	if (c == '/') {
		iri->path = s; // note: without the leading '/'
		while (*s && *s != '?' && *s != '#')
			s++;
		c = *s;
		if (c) *s++ = 0;
	}

	if (c == '?') {
		iri->query = s;
		while (*s && *s != '#')
			s++;
		c = *s;
		if (c) *s++ = 0;
	}

	if (c == '#') {
		iri->fragment = s;
		s += strlen(s);
	}

	if (*s) {
		debug_printf("unparsed rest '%s'\n", s);
	}

	// split the authority into [userinfo@]host[:port]
	if (*authority) {
		s = authority;
		p = strchr(authority, '@');
		if (p) {
			iri->userinfo = s;
			*p = 0;
			s = p + 1;
		}
		if (*s == '[') {
			// bracketed (IP literal) host: strip the brackets
			p = strrchr(s, ']');
			if (p) {
				iri->host = s + 1;
				*p = 0;
				s = p + 1;
			} else {
				// something is broken
				iri->host = s + 1;
				s += strlen(s);
			}
		} else {
			iri->host = s;
			while (*s && *s != ':')
				s++;
		}
		if (*s == ':') {
			if (s[1]) {
				// only remember a port that differs from the scheme's default
				if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port)))
					iri->port = s + 1;
			}
		}
		*s = 0; // terminate the host (cuts off ":port" or is a no-op at the end)
	}

	iri->resolv_port = iri->port ? iri->port : default_port;

	// normalize the host: lowercase it, then run it through mget_str_to_ascii();
	// a returned pointer that differs from the input replaces the host
	if (iri->host) {
		mget_strtolower((char *)iri->host);
		if ((p = (char *)mget_str_to_ascii(iri->host)) != iri->host) {
			iri->host = p;
			iri->host_allocated = 1;
		}
	}
	else {
		if (iri->scheme == MGET_IRI_SCHEME_HTTP || iri->scheme == MGET_IRI_SCHEME_HTTPS) {
			error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
			mget_iri_free(&iri);
			return NULL;
		}
	}

	return iri;
}
Exemple #3
0
/**
 * Parse a URI string into its components.
 *
 * Leading whitespace is skipped. The result is a single xmalloc()ed block
 * holding the MGET_IRI followed by two copies of the input: the first stays
 * intact as `iri->uri`, the second is split in place by writing NUL bytes,
 * and scheme/userinfo/host/port/path/query/fragment point into it.
 * Exceptions: `scheme` points to a static entry of `iri_schemes` when the
 * scheme is known, and `host` points to the string produced by
 * idna_to_ascii_8z() (with `host_allocated` set) when that conversion
 * succeeds.
 *
 * Details visible in the code:
 *  - A scheme is only recognized if the first gen-delimiter is a ':' that is
 *    directly followed by '/'. Otherwise IRI_SCHEME_DEFAULT and iri_ports[0]
 *    are used.
 *  - `path` is stored without its leading '/'.
 *  - `port` is only set when it differs from the scheme's default port;
 *    `resolv_port` is `port` if set, else the default port (may be NULL for
 *    unknown schemes).
 *  - host, path, query and fragment are passed through _unescape(); the host
 *    is additionally converted via mget_str_to_utf8()/idna_to_ascii_8z() and
 *    ASCII-lowercased.
 *
 * @param s_uri     NUL-terminated URI text; may be NULL.
 * @param encoding  Passed through to mget_str_to_utf8() for the host.
 * @return          The parsed IRI, or NULL if `s_uri` is NULL, empty or
 *                  whitespace-only, or if an http/https URI has no host.
 *                  NOTE(review): presumably released by the caller with
 *                  mget_iri_free() - confirm.
 */
MGET_IRI *mget_iri_parse(const char *s_uri, const char *encoding)
{
	MGET_IRI *iri;
	const char *default_port = NULL;
	char *p, *s, *authority, c;
	size_t slen, it;

	if (!s_uri)
		return NULL;

	/*
		URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
		hier-part   = "//" authority path-abempty / path-absolute / path-rootless / path-empty
		scheme      =  ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
	 */
	// <ctype.h> functions need a value representable as unsigned char;
	// passing a negative plain char (bytes >= 0x80) is undefined behavior.
	while (isspace((unsigned char)*s_uri)) s_uri++;
	if (!*s_uri) return NULL;

	// just use one block of memory for all parsed URI parts:
	// [MGET_IRI][pristine copy of s_uri + NUL][working copy of s_uri + NUL]
	slen = strlen(s_uri);
	iri = xmalloc(sizeof(MGET_IRI) + slen * 2 + 2);
	memset(iri, 0, sizeof(MGET_IRI));
	strcpy(((char *)iri) + sizeof(MGET_IRI), s_uri);
	iri->uri = ((char *)iri) + sizeof(MGET_IRI);
	s = ((char *)iri) + sizeof(MGET_IRI) + slen + 1;
	strcpy(s, s_uri);

	p = s;
	while (*s && !_iri_isgendelim(*s))
		s++;

	if (*s == ':' && s[1]=='/') {
		// found a scheme
		*s++ = 0;

		// find the scheme in our static list of supported schemes
		// for later comparisons we compare pointers (avoiding strcasecmp())
		iri->scheme = p;
		for (it = 0; iri_schemes[it]; it++) {
			if (!strcasecmp(iri_schemes[it], p)) {
				iri->scheme = iri_schemes[it];
				default_port = iri_ports[it];
				break;
			}
		}

		if (iri->scheme == p) {
			// unknown scheme: convert our own copy to lowercase.
			// ASCII-only on purpose (same test as for the host below), so the
			// result does not depend on the locale and no negative char is
			// ever handed to a <ctype.h> function.
			for (; *p; p++)
				if (*p >= 'A' && *p <= 'Z')
					*p = (char)tolower((unsigned char)*p);
		}

	} else {
		iri->scheme = IRI_SCHEME_DEFAULT;
		default_port = iri_ports[0]; // port 80
		s = p; // rewind
	}

	// this is true for http, https, ftp, file
	if (s[0] == '/' && s[1] == '/')
		s += 2;

	// authority
	authority = s;
	while (*s && *s != '/' && *s != '?' && *s != '#')
		s++;
	c = *s;
	if (c) *s++ = 0;

	// left over: [path][?query][#fragment]
	if (c == '/') {
		iri->path = s; // note: without the leading '/'
		while (*s && *s != '?' && *s != '#')
			s++;
		c = *s;
		if (c) *s++ = 0;
	}

	if (c == '?') {
		iri->query = s;
		while (*s && *s != '#')
			s++;
		c = *s;
		if (c) *s++ = 0;
	}

	if (c == '#') {
		iri->fragment = s;
		while (*s)
			s++;
	}

	if (*s) {
		debug_printf("unparsed rest '%s'\n", s);
	}

	// split the authority into [userinfo@]host[:port]
	if (*authority) {
		s = authority;
		p = strchr(authority, '@');
		if (p) {
			iri->userinfo = s;
			*p = 0;
			s = p + 1;
		}
		if (*s == '[') {
			// bracketed (IP literal) host: strip the brackets
			p = strrchr(s, ']');
			if (p) {
				iri->host = s + 1;
				*p = 0;
				s = p + 1;
			} else {
				// something is broken
				iri->host = s + 1;
				while (*s) s++;
			}
		} else {
			iri->host = s;
			while (*s && *s != ':')
				s++;
		}
		if (*s == ':') {
			if (s[1]) {
				// only remember a port that differs from the scheme's default
				if (!default_port || (strcmp(s + 1, default_port) && atoi(s + 1) != atoi(default_port)))
					iri->port = s + 1;
			}
		}
		*s = 0; // terminate the host (cuts off ":port" or is a no-op at the end)
	}

	iri->resolv_port = iri->port ? iri->port : default_port;

	// now unescape all components (not interested in display, userinfo, password)
	if (iri->host) {
		const char *host_utf;

		_unescape((unsigned char *)iri->host);

		host_utf = mget_str_to_utf8(iri->host, encoding);

		if (host_utf) {
			char *host_asc = NULL;
			int rc;

			if ((rc = idna_to_ascii_8z(host_utf, &host_asc, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
				// the converted host lives in its own allocation from now on
				iri->host = host_asc;
				iri->host_allocated = 1;
			} else
				error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));

			xfree(host_utf);
		}

		// ASCII-only lowercasing: isupper() may also return true for
		// chars > 0x7f; the range test is not EBCDIC compatible
		for (p = (char *)iri->host; *p; p++)
			if (*p >= 'A' && *p <= 'Z')
				*p = tolower(*p);
	}
	else {
		if (iri->scheme == IRI_SCHEME_HTTP || iri->scheme == IRI_SCHEME_HTTPS) {
			error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
			mget_iri_free(&iri);
			return NULL;
		}
	}
	if (iri->path)
		_unescape((unsigned char *)iri->path);
	if (iri->query)
		_unescape((unsigned char *)iri->query);
	if (iri->fragment)
		_unescape((unsigned char *)iri->fragment);

	return iri;
}