Beispiel #1
0
/*
 * Clean up the URL text held in URL->data: trim surrounding whitespace, cut
 * the text right after the hostname (keeping protocol + host only), lowercase
 * the hostname, replace stray '<', '>', '"' and ';' characters and decode
 * %xx escapes.
 *
 * URL     - in/out: the source text is read from URL->data and is modified in
 *           place while being cleaned (a '\0' is written after the hostname);
 *           receives the final cleaned value.
 * pre_URL - out: when isReal is 0, receives the cleaned value as it was before
 *           str_fixup_spaces() ran. Set to null together with URL when the
 *           input is rejected. Not written on the isReal success path.
 * isReal  - non-zero: strip spaces with str_strip() and store only into URL;
 *           zero: trim spaces at both ends, store into pre_URL, then apply
 *           str_fixup_spaces() and store into URL.
 *
 * Returns 0 on success and also when the input is rejected (empty after the
 * leading whitespace, a single character, or starting with one of the
 * dotnet/adonet/aspnet prefixes); otherwise the non-zero code returned by
 * string_assign_dup().
 *
 * Allocates memory (presumably inside string_assign_dup() - confirm against
 * its definition). URL->data is dereferenced without a NULL check, so callers
 * must pass a valid string.
 */
static int
cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
{
	char *begin = URL->data;
	const char *end;
	size_t len;

	clear_msb(begin);
	/*TODO: handle hex-encoded IPs*/
	/* <ctype.h> classifiers are only defined for unsigned char values (or
	 * EOF), so never hand them a plain, possibly negative, char */
	while(isspace((unsigned char)*begin))
		begin++;

	len = strlen(begin);
	if(len == 0) {
		string_assign_null(URL);
		string_assign_null(pre_URL);
		return 0;
	}

	/* @end points to the last character of the URL, not one past it */
	end = begin + len - 1;
	if(begin >= end) {
		/* a single character cannot be a usable URL */
		string_assign_null(URL);
		string_assign_null(pre_URL);
		return 0;
	}
	/* *begin is known to be non-space here, so this stops at begin at the latest */
	while(isspace((unsigned char)*end))
		end--;
	/* From mailscanner, my comments enclosed in {} */
	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
		string_assign_null(URL);
		string_assign_null(pre_URL);
	}
	else {
		size_t host_len;
		char* host_begin;
		int rc;

		str_replace(begin,end,'\\','/');
		/* find beginning of hostname, because:
		 * - we want to keep only protocol, host, and
		 *  strip path & query parameter(s)
		 * - we want to make hostname lowercase*/
		host_begin = strchr(begin,':');
		/* step over the slashes that follow the first ':' */
		while(host_begin && (host_begin < end) && (host_begin[1] == '/'))  host_begin++;
		/* no ':' at all: treat the whole text as starting with the host */
		if(!host_begin) host_begin=begin;
		else host_begin++;
		host_len = strcspn(host_begin,":/?");
		if(host_begin + host_len > end + 1) {
			/* prevent hostname extending beyond end, it can happen
			 * if we have spaces at the end, we don't want those part of
			 * the hostname */
			host_len = end - host_begin + 1;
		} else {
			/* cut the URL after the hostname */
			/* @end points to last character we want to be part of the URL */
			end = host_begin + host_len - 1;
		}
		/* terminate the buffer right after the hostname */
		host_begin[host_len] = '\0';
		/* convert hostname to lowercase, but only hostname! */
		str_make_lowercase(host_begin, host_len);
		/* some broken MUAs put > in the href, and then
		 * we get a false positive, so remove them */
		str_replace(begin,end,'<',' ');
		str_replace(begin,end,'>',' ');
		str_replace(begin,end,'\"',' ');
		str_replace(begin,end,';',' ');
		str_strip(&begin,&end,lt,lt_len);
		str_strip(&begin,&end,gt,gt_len);
		/* convert %xx to real value */
		str_hex_to_char(&begin,&end);
		if(isReal) {
			/* htmlnorm converts \n to space, so we have to strip spaces */
			str_strip(&begin, &end, " ", 1);
		}
		else {
			/* trim space */
			while((begin <= end) && (begin[0]==' '))  begin++;
			while((begin <= end) && (end[0]==' ')) end--;
		}
		/* first copy: the final value for real URLs, the pre-fixup value otherwise */
		if (( rc = string_assign_dup(isReal ? URL : pre_URL,begin,end+1) )) {
			string_assign_null(URL);
			return rc;
		}
		if(!isReal) {
			/* second copy: same text after str_fixup_spaces() */
			str_fixup_spaces(&begin,&end);
			if (( rc = string_assign_dup(URL, begin, end+1) )) {
				return rc;
			}
		}
	}
	return 0;
}
/*
 * Clean up the URL text held in URL->data: trim surrounding whitespace,
 * turn backslashes into slashes, replace stray '<', '>', '"' and ';'
 * characters, lowercase the hostname and decode %xx escapes. Unlike a
 * host-only cleanup, the path and query part are kept.
 *
 * URL     - in/out: the source text is read from URL->data and edited in
 *           place; receives the final cleaned value.
 * pre_URL - out: when isReal is 0, receives the cleaned value as it was before
 *           str_fixup_spaces() ran. Set to null together with URL when the
 *           input is rejected. Not written on the isReal success path.
 * isReal  - non-zero: strip spaces with str_strip() and store only into URL;
 *           zero: trim spaces at both ends, store into pre_URL, then apply
 *           str_fixup_spaces() and store into URL.
 *
 * Returns 0 on success and also when the input is rejected (empty after the
 * leading whitespace, a single character, or starting with one of the
 * dotnet/adonet/aspnet prefixes); otherwise the non-zero code returned by
 * string_assign_dup().
 *
 * Allocates memory (presumably inside string_assign_dup() - confirm against
 * its definition). URL->data is dereferenced without a NULL check, so callers
 * must pass a valid string.
 */
static int
cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
{
	char *begin = URL->data;
	const char *end;
	size_t len;

	clear_msb(begin);
	/*TODO: handle hex-encoded IPs*/
	/* <ctype.h> classifiers are only defined for unsigned char values (or
	 * EOF), so never hand them a plain, possibly negative, char */
	while(isspace((unsigned char)*begin))
		begin++;

	len = strlen(begin);
	if(len == 0) {
		string_assign_null(URL);
		string_assign_null(pre_URL);
		return 0;
	}

	/* @end points to the last character of the URL, not one past it */
	end = begin + len - 1;
	if(begin >= end) {
		/* a single character cannot be a usable URL */
		string_assign_null(URL);
		string_assign_null(pre_URL);
		return 0;
	}
	/* *begin is known to be non-space here, so this stops at begin at the latest */
	while(isspace((unsigned char)*end))
		end--;
	/*TODO: convert \ to /, and stuff like that*/
	/* From mailscanner, my comments enclosed in {} */
	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
		string_assign_null(URL);
		string_assign_null(pre_URL);
	}
	else {
		size_t host_len;
		char* host_begin;
		int rc;

		str_replace(begin,end,'\\','/');
		/* some broken MUAs put > in the href, and then
		 * we get a false positive, so remove them */
		str_replace(begin,end,'<',' ');
		str_replace(begin,end,'>',' ');
		str_replace(begin,end,'\"',' ');
		str_replace(begin,end,';',' ');
		str_strip(&begin,&end,lt,lt_len);
		str_strip(&begin,&end,gt,gt_len);
		/* convert hostname to lowercase, but only hostname! */
		host_begin = strchr(begin,':');
		/* step over the slashes that follow the first ':' */
		while(host_begin && host_begin[1]=='/') host_begin++;
		/* no ':' at all: treat the whole text as starting with the host */
		if(!host_begin) host_begin=begin;
		else host_begin++;
		/* host runs up to the first '/' or '?' (or the terminating NUL);
		 * note that this length is not clamped to @end */
		host_len = strcspn(host_begin,"/?");
		str_make_lowercase(host_begin,host_len);
		/* convert %xx to real value */
		str_hex_to_char(&begin,&end);
		if(isReal) {
			/* htmlnorm converts \n to space, so we have to strip spaces */
			str_strip(&begin, &end, " ", 1);
		}
		else {
			/* trim space */
			while((begin <= end) && (begin[0]==' '))  begin++;
			while((begin <= end) && (end[0]==' ')) end--;
		}
		/* first copy: the final value for real URLs, the pre-fixup value otherwise */
		if (( rc = string_assign_dup(isReal ? URL : pre_URL,begin,end+1) )) {
			string_assign_null(URL);
			return rc;
		}
		if(!isReal) {
			/* second copy: same text after str_fixup_spaces() */
			str_fixup_spaces(&begin,&end);
			if (( rc = string_assign_dup(URL,begin,end+1) )) {
				return rc;
			}
		}
	}
	return 0;
}
Beispiel #3
0
/*
 * Canonicalize the URL in inurl inside the caller-supplied buffer urlbuff and
 * report where its host and path parts are located.
 *
 * inurl    - URL text to canonicalize.
 * len      - length of the URL; clamped to the usable part of urlbuff.
 * urlbuff  - work/output buffer; *host and (normally) *path point into it.
 * dest_len - size of urlbuff in bytes. Three bytes are reserved as slack for
 *            the terminators and the '/' inserted after the host, so at most
 *            dest_len - 3 bytes of inurl are copied.
 * host, hostlen - out: start of the lowercased hostname inside urlbuff and
 *            its length (the hostname is followed by "/" and a NUL).
 * path, pathlen - out: start of the path after that NUL and its length, with
 *            anything from an unescaped '#' on cut off; "" and 0 when nothing
 *            follows the host.
 *
 * Returns CL_PHISH_CLEAN without touching the out parameters when the buffer
 * is smaller than the reserved slack or the URL contains no ':' (no
 * protocol); returns CL_PHISH_NODECISION once the out parameters are set.
 */
int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, char **host, size_t *hostlen, const char **path, size_t *pathlen)
{
	char *url, *p, *last;
	char *host_begin, *path_begin;
	const char *urlend;
	size_t host_len, path_len;

	/* the subtraction below would wrap for tiny buffers and make strncpy()
	 * write (and zero-pad) far past the end of urlbuff.
	 * NOTE(review): CL_PHISH_CLEAN mirrors the "nothing to canonicalize"
	 * exit below - confirm callers are happy with it for this case */
	if(dest_len < 3)
		return CL_PHISH_CLEAN;
	dest_len -= 3;
	/* only dest_len bytes are copied, so the end pointer must not lie
	 * beyond them even when the caller's URL is longer */
	if(len > dest_len)
		len = dest_len;
	urlend = urlbuff + len;
	strncpy(urlbuff, inurl, dest_len);
	/* strncpy() does not terminate on truncation; clear the slack bytes */
	urlbuff[dest_len] = urlbuff[dest_len+1] = urlbuff[dest_len+2] = '\0';
	url = urlbuff;

	/* canonicalize only real URLs, with a protocol */
	host_begin = strchr(url, ':');
	if(!host_begin)
		return CL_PHISH_CLEAN;
	++host_begin;

	/* ignore username in URL */
	p = strchr(host_begin, '@');
	if (p)
		host_begin = p+1;
	url = host_begin;
	/* repeatedly % unescape characters; the helper gets the addresses of
	 * url and urlend and may therefore move them */
	str_hex_to_char(&url, &urlend);
	host_begin = url;
	/* skip to beginning of hostname (bounds are checked before the
	 * character is read) */
	while((host_begin < urlend) && *host_begin == '/') ++host_begin;
	while((host_begin < urlend) && *host_begin == '.') ++host_begin;

	/* @last tracks the most recent '/' seen, i.e. the start of the path
	 * component that a following "/../" removes */
	last = strchr(host_begin, '/');
	p = host_begin;
	while (p < urlend) {
		if (p+2 < urlend && *p == '/' && p[1] == '.' ) {
			if (p[2] == '/') {
				/* remove /./ */
				if (p + 3 < urlend)
					memmove(p+1, p+3, urlend - p - 3);
				urlend -= 2;
			}
			else if (p[2] == '.' && (p[3] == '/' || p[3] == '\0') && last) {
				/* remove /component/../ */
				if (p+4 < urlend)
					memmove(last+1, p+4, urlend - p - 4);
				urlend -= 3 + (p - last);
			}
		}
		if (*p == '/')
			last = p;
		p++;
	}
	/* terminate at the (possibly shortened) end */
	p = &url[urlend - url];
	*p = '\0';

	p = host_begin;
	/* each escape grows the text by two bytes, hence the extra bound checks */
	while (p < urlend && p+2 < url + dest_len && urlend < urlbuff+dest_len) {
		unsigned char c = *p;
		if (c <= 32 || c >= 127 || c == '%' || c == '#') {
			/* convert non-ascii characters back to % escaped */
			const char hexchars[] = "0123456789ABCDEF";
			memmove(p+3, p+1, urlend - p - 1);
			*p++ = '%';
			*p++ = hexchars[c>>4];
			*p = hexchars[c&0xf];
			urlend += 2;
		}
		p++;
	}
	*p = '\0';
	urlend = p;
	len = urlend - url;
	/* determine end of hostname */
	host_len = strcspn(host_begin, ":/?");
	path_begin = host_begin + host_len;
	if(host_len <= len) {
		/* split host and path: shift everything after the delimiter
		 * (including the terminating NUL) up by one byte, then overwrite
		 * the delimiter with '/' and put a NUL behind it.
		 * The count is measured from path_begin; measuring it from url
		 * would over-copy by the number of skipped leading '/' and '.'
		 * characters and could run past the slack bytes. */
		memmove(path_begin + 2, path_begin + 1, (size_t)(urlend - path_begin));
		*path_begin++ = '/';
		*path_begin++ = '\0';
	} else path_begin = url+len;
	if(url + len >= path_begin) {
		path_len = url + len - path_begin + 1;
		p = strchr(path_begin, '#');
		if (p) {
			/* ignore anchor */
			*p = '\0';
			path_len = p - path_begin;
		}
		*path = path_begin;
	} else {
		/* nothing follows the host */
		path_len = 0;
		*path = "";
	}
	/* lowercase the hostname only; the path keeps its case */
	str_make_lowercase(host_begin, host_len);
	*host = host_begin;
	*hostlen = host_len;
	*pathlen = path_len;
	return CL_PHISH_NODECISION;
}