/* allocates memory */ static int cleanupURL(struct string *URL,struct string *pre_URL, int isReal) { char *begin = URL->data; const char *end; size_t len; clear_msb(begin); /*if(begin == NULL) return;*/ /*TODO: handle hex-encoded IPs*/ while(isspace(*begin)) begin++; len = strlen(begin); if(len == 0) { string_assign_null(URL); string_assign_null(pre_URL); return 0; } end = begin + len - 1; /*cli_dbgmsg("%d %d\n", end-begin, len);*/ if(begin >= end) { string_assign_null(URL); string_assign_null(pre_URL); return 0; } while(isspace(*end)) end--; /*TODO: convert \ to /, and stuff like that*/ /* From mailscanner, my comments enclosed in {} */ if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) { string_assign_null(URL); string_assign_null(pre_URL); } else { size_t host_len; char* host_begin; int rc; str_replace(begin,end,'\\','/'); /* some broken MUAs put > in the href, and then * we get a false positive, so remove them */ str_replace(begin,end,'<',' '); str_replace(begin,end,'>',' '); str_replace(begin,end,'\"',' '); str_replace(begin,end,';',' '); str_strip(&begin,&end,lt,lt_len); str_strip(&begin,&end,gt,gt_len); /* convert hostname to lowercase, but only hostname! */ host_begin = strchr(begin,':'); while(host_begin && host_begin[1]=='/') host_begin++; if(!host_begin) host_begin=begin; else host_begin++; host_len = strcspn(host_begin,"/?"); str_make_lowercase(host_begin,host_len); /* convert %xx to real value */ str_hex_to_char(&begin,&end); if(isReal) { /* htmlnorm converts \n to space, so we have to strip spaces */ str_strip(&begin, &end, " ", 1); } else { /* trim space */ while((begin <= end) && (begin[0]==' ')) begin++; while((begin <= end) && (end[0]==' ')) end--; } if (( rc = string_assign_dup(isReal ? URL : pre_URL,begin,end+1) )) { string_assign_null(URL); return rc; } if(!isReal) { str_fixup_spaces(&begin,&end); if (( rc = string_assign_dup(URL,begin,end+1) )) { return rc; } } /*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/ } return 0; }
/* allocates memory */ static int cleanupURL(struct string *URL,struct string *pre_URL, int isReal) { char *begin = URL->data; const char *end; size_t len; clear_msb(begin); /*if(begin == NULL) return;*/ /*TODO: handle hex-encoded IPs*/ while(isspace(*begin)) begin++; len = strlen(begin); if(len == 0) { string_assign_null(URL); string_assign_null(pre_URL); return 0; } end = begin + len - 1; /*cli_dbgmsg("%d %d\n", end-begin, len);*/ if(begin >= end) { string_assign_null(URL); string_assign_null(pre_URL); return 0; } while(isspace(*end)) end--; /* From mailscanner, my comments enclosed in {} */ if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) { string_assign_null(URL); string_assign_null(pre_URL); } else { size_t host_len; char* host_begin; int rc; str_replace(begin,end,'\\','/'); /* find beginning of hostname, because: * - we want to keep only protocol, host, and * strip path & query parameter(s) * - we want to make hostname lowercase*/ host_begin = strchr(begin,':'); while(host_begin && (host_begin < end) && (host_begin[1] == '/')) host_begin++; if(!host_begin) host_begin=begin; else host_begin++; host_len = strcspn(host_begin,":/?"); if(host_begin + host_len > end + 1) { /* prevent hostname extending beyond end, it can happen * if we have spaces at the end, we don't want those part of * the hostname */ host_len = end - host_begin + 1; } else { /* cut the URL after the hostname */ /* @end points to last character we want to be part of the URL */ end = host_begin + host_len - 1; } host_begin[host_len] = '\0'; /* convert hostname to lowercase, but only hostname! */ str_make_lowercase(host_begin, host_len); /* some broken MUAs put > in the href, and then * we get a false positive, so remove them */ str_replace(begin,end,'<',' '); str_replace(begin,end,'>',' '); str_replace(begin,end,'\"',' '); str_replace(begin,end,';',' '); str_strip(&begin,&end,lt,lt_len); str_strip(&begin,&end,gt,gt_len); /* convert %xx to real value */ str_hex_to_char(&begin,&end); if(isReal) { /* htmlnorm converts \n to space, so we have to strip spaces */ str_strip(&begin, &end, " ", 1); } else { /* trim space */ while((begin <= end) && (begin[0]==' ')) begin++; while((begin <= end) && (end[0]==' ')) end--; } if (( rc = string_assign_dup(isReal ? URL : pre_URL,begin,end+1) )) { string_assign_null(URL); return rc; } if(!isReal) { str_fixup_spaces(&begin,&end); if (( rc = string_assign_dup(URL, begin, end+1) )) { return rc; } } } return 0; }