static char * uri_normalized_copy (const char *part, int length, const char *unescape_extra) { unsigned char *s, *d, c; char *normalized = g_strndup (part, length); gboolean need_fixup = FALSE; s = d = (unsigned char *)normalized; do { if (*s == '%') { if (!g_ascii_isxdigit (s[1]) || !g_ascii_isxdigit (s[2])) { *d++ = *s; continue; } c = HEXCHAR (s); if (soup_char_is_uri_unreserved (c) || (unescape_extra && strchr (unescape_extra, c))) { *d++ = c; s += 2; } else { /* We leave it unchanged. We used to uppercase percent-encoded * triplets but we do not do it any more as RFC3986 Section 6.2.2.1 * says that they only SHOULD be case normalized. */ *d++ = *s++; *d++ = *s++; *d++ = *s; } } else { if (!g_ascii_isgraph (*s)) need_fixup = TRUE; *d++ = *s; } } while (*s++); if (need_fixup) { GString *fixed; fixed = g_string_new (NULL); s = (guchar *)normalized; while (*s) { if (g_ascii_isgraph (*s)) g_string_append_c (fixed, *s); else g_string_append_printf (fixed, "%%%02X", (int)*s); s++; } g_free (normalized); normalized = g_string_free (fixed, FALSE); } return normalized; }
static gboolean parse_parameters (const gchar *argument, gint index, gint *parsed_position, GHashTable **parameters, GError **error) { gint local_index = 0; while (argument[index + local_index] == ' ') { gint i, current_argument_index; gint keyword_length, value_length = 0; const gchar *keyword, *value = NULL; current_argument_index = index + local_index + 1; keyword = argument + current_argument_index; if (!keyword[0]) RETURN_ERROR_WITH_POSITION("parameter keyword is missing", argument, current_argument_index); if (!g_ascii_isalnum(keyword[0])) RETURN_ERROR_WITH_POSITION("parameter keyword should start with " "alphabet or digit", argument, current_argument_index); i = 1; while (g_ascii_isalnum(keyword[i]) || keyword[i] == '-') { i++; } keyword_length = i; if (keyword[i] == '=') { gint j = 0; value = keyword + i + 1; while (g_ascii_isgraph(value[j]) && value[j] != '=') { j++; } value_length = j; i += 1 + j; } if (parameters) { if (!*parameters) *parameters = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free); g_hash_table_insert(*parameters, g_strndup(keyword, keyword_length), value ? g_strndup(value, value_length) : NULL); } local_index += i + 1; } *parsed_position = local_index; return TRUE; }
static gboolean isvalid_rfc2428_delimiter(const guchar c) { /* RFC2428 sect. 2 states rules for a valid delimiter */ const gchar *forbidden = "0123456789abcdef.:"; if (!g_ascii_isgraph(c)) return FALSE; if (strchr(forbidden, g_ascii_tolower(c))) return FALSE; return TRUE; }
static gboolean parse_domain (const gchar *argument, gint index, gint *parsed_position, GError **error) { gint i; const gchar *domain; domain = argument + index; if (domain[0] == '[') { i = 1; while (TRUE) { if (domain[i] == '[' || domain[i] == ']') { break; } else if (domain[i] == '\\') { i++; if (IS_TEXT(domain[i])) { i++; } else { RETURN_ERROR_WITH_POSITION("invalid quoted character " "in domain", argument, index + i); } } else if (g_ascii_isspace(domain[i])) { break; } else if (g_ascii_iscntrl(domain[i]) || g_ascii_isgraph(domain[i])) { i++; } else { break; } } if (domain[i] != ']') RETURN_ERROR_WITH_POSITION("terminate ']' is missing in domain", argument, index + i); i++; } else { i = 0; if (!g_ascii_isalnum(domain[i])) RETURN_ERROR_WITH_POSITION("domain should start with " "alphabet or digit", argument, index + i); do { i++; while (g_ascii_isalnum(domain[i]) || domain[i] == '-') { i++; } } while (domain[i] == '.'); } *parsed_position = i; return TRUE; }
static gboolean parse_local_part (const gchar *argument, gint index, gint *parsed_position, GError **error) { gint i; const gchar *local_part; local_part = argument + index; if (local_part[0] == '"') { i = 1; while (TRUE) { if (local_part[i] == '\\') { i++; if (IS_TEXT(local_part[i])) { i++; } else { RETURN_ERROR_WITH_POSITION("invalid quoted character " "in local part", argument, index + i); } } else if (local_part[i] == '"') { break; } else if (g_ascii_isspace(local_part[i])) { break; } else if (g_ascii_iscntrl(local_part[i]) || g_ascii_isgraph(local_part[i])) { i++; } else { break; } } if (local_part[i] != '"') RETURN_ERROR_WITH_POSITION("end quote for local part is missing", argument, index + i); i++; } else { i = -1; do { i++; while (IS_ATOM_TEXT(local_part[i])) { i++; } } while (local_part[i] == '.'); } *parsed_position = i; return TRUE; }
gboolean file_utils_filename_is_uri (const gchar *filename, GError **error) { g_return_val_if_fail (filename != NULL, FALSE); g_return_val_if_fail (error == NULL || *error == NULL, FALSE); if (strstr (filename, "://")) { gchar *scheme; gchar *canon; scheme = g_strndup (filename, (strstr (filename, "://") - filename)); canon = g_strdup (scheme); g_strcanon (canon, G_CSET_A_2_Z G_CSET_a_2_z G_CSET_DIGITS "+-.", '-'); if (strcmp (scheme, canon) || ! g_ascii_isgraph (canon[0])) { g_set_error (error, G_FILE_ERROR, 0, _("'%s:' is not a valid URI scheme"), scheme); g_free (scheme); g_free (canon); return FALSE; } g_free (scheme); g_free (canon); if (! g_utf8_validate (filename, -1, NULL)) { g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, _("Invalid character sequence in URI")); return FALSE; } return TRUE; } return FALSE; }
struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, struct html_tag_component *comp) { struct rspamd_url *url; gchar *decoded; gint rc; gsize decoded_len; const gchar *p, *s; gchar *d; guint i, dlen; gboolean has_bad_chars = FALSE; static const gchar hexdigests[16] = "0123456789abcdef"; p = start; /* Strip spaces from the url */ /* Head spaces */ while (g_ascii_isspace (*p) && p < start + len) { p ++; start ++; len --; } if (comp) { comp->start = p; comp->len = len; } /* Trailing spaces */ p = start + len - 1; while (g_ascii_isspace (*p) && p >= start) { p --; len --; if (comp) { comp->len --; } } s = start; dlen = 0; for (i = 0; i < len; i ++) { if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) { dlen += 3; } else { dlen ++; } } decoded = rspamd_mempool_alloc (pool, dlen + 1); d = decoded; /* We also need to remove all internal newlines and encode unsafe characters */ for (i = 0; i < len; i ++) { if (G_UNLIKELY (s[i] == '\r' || s[i] == '\n')) { continue; } else if (G_UNLIKELY (((guint)s[i]) < 0x80 && !g_ascii_isgraph (s[i]))) { /* URL encode */ *d++ = '%'; *d++ = hexdigests[(s[i] >> 4) & 0xf]; *d++ = hexdigests[s[i] & 0xf]; has_bad_chars = TRUE; } else {