/*
 * ===  FUNCTION  ======================================================================
 *         Name:  is_reserved
 *  Description:  Report whether c is a reserved character, i.e. one that is
 *                accepted by either is_gen_delim() or is_sub_delim(). Reserved
 *                characters may only appear in certain parts of a URI string.
 *                Returns true for a reserved character, false otherwise.
 * =====================================================================================
 */
static bool
is_reserved( char c )
{
    /* gen-delims are tested first; sub-delims only if that test fails. */
    return ( is_gen_delim(c) || is_sub_delim(c) );
}
/*
 * ===  FUNCTION  ======================================================================
 *         Name:  is_pchar
 *  Description:  Test whether a character may appear in a pchar as described by
 *                RFC3986:
 *                    pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 *                Only a single character is examined, so a three character
 *                pct-encoded sequence cannot be recognised here; callers have to
 *                deal with those themselves.
 *                Returns true when c is accepted by is_unreserved() or
 *                is_sub_delim(), or is ':' or '@'; otherwise returns false.
 * =====================================================================================
 */
static bool
is_pchar( char c)
{
    /* Same test order as the grammar above: helpers first, literals last. */
    return ( is_unreserved(c) || is_sub_delim(c) || c == ':' || c == '@' );
}
/*
 * ===  FUNCTION  ======================================================================
 *         Name:  uri_norm_host
 *  Description:  Normalize the host section of the URI if it is set. The host section
 *                that is normalized here is the reg-name, and not the IP address.
 *                The IP address should be inside the IP section of the structure.
 *
 *                This section attempts to implement section 3.2.2 of RFC3986 with the
 *                exception that userinfo is not checked as HTTP and HTTPS do not
 *                specify the usage of it.
 *
 *                Processing is done on the string returned by URI_CP_PT(); the
 *                URI object is only updated when the whole host validated.
 *                  - "%XX" triplets are passed through norm_pct() and written back.
 *                  - Any other character must satisfy is_sub_delim() or
 *                    is_unreserved(), otherwise the host is rejected.
 *                  - Upper case letters are folded to lower case.
 *
 *      Returns:  0       on success, or when no host is set (nothing to do).
 *                EILSEQ  when the host holds an illegal character or a
 *                        truncated percent-encoded triplet.
 *                ENOMEM  when the scratch buffer could not be allocated.
 *                Any non-zero value returned by norm_pct() is passed through.
 *                On any error the host stored in uri is left untouched.
 * =====================================================================================
 */
extern int
uri_norm_host( uriobj_t *uri)
{
    int    err  = 0;
    size_t i    = 0,
           len  = 0;
    char  *pct  = NULL,
          *old  = NULL,
          *host = URI_CP_PT(uri->uri_host);

    if( ! host ) {
        return 0;
    }
    /* norm_pct() takes a char **, so the scratch triplet stays on the heap
     * in case it replaces the buffer.  NOTE(review): confirm against norm_pct. */
    pct = malloc(4);
    if( ! pct ) {
        err = ENOMEM;
        goto cleanup;
    }
    len = strlen(host);
    for( i = 0; i < len; i ++ ) {
        unsigned char uc = (unsigned char) host[i];

        if( host[i] == '%' ) {
            /* A triplet needs two more characters before the terminator. */
            if( (i + 2) >= len ) {
                err = EILSEQ;
                break;
            }
            pct[0] = host[i];
            pct[1] = host[i + 1];
            pct[2] = host[i + 2];
            pct[3] = '\0';
            err = norm_pct(&pct);
            if( err ) {
                break;
            }
            host[i]     = pct[0];
            host[i + 1] = pct[1];
            host[i + 2] = pct[2];
            /* Step over the two hex digits only; the loop increment moves
             * past the third character so the next one is still checked. */
            i += 2;
            continue;
        }
        if( !(is_sub_delim(host[i]) || is_unreserved(host[i])) ) {
            err = EILSEQ;
            break;
        }
        /* <ctype.h> functions require a value representable as unsigned char. */
        if( isupper(uc) ) {
            host[i] = (char) tolower(uc);
        }
    }
    if( ! err ) {
        /* Install the normalized host, then release the one it replaces. */
        old = *(uri->uri_host);
        *(uri->uri_host) = host;
        /* URI_CP_PT is presumed to return a fresh copy; the comparison keeps
         * this safe should it ever hand back the stored pointer itself. */
        if( old != host ) {
            free(old);
        }
        host = NULL;
    }

cleanup:
    free(pct);
    /* On failure the working copy is ours to release (unless it aliases
     * the stored host, see above). */
    if( host && host != *(uri->uri_host) ) {
        free(host);
    }
    return err;
}
/** Built-in preprocessing callback
 *
 * Built-in preprocessing callback to break or not to break URLs according to
 * some rules by Chicago Manual of Style 15th ed.
 *
 * The function runs in one of two modes, selected by whether text is NULL.
 *
 * Pass I (text != NULL): scan str for the first substring that looks like a
 * URL.  On return str->str points at the start of the match and str->len is
 * its length; when nothing matched str->str is set to NULL and str->len is
 * left unchanged.  The return value is always NULL in this pass.  The
 * contents of text are never read here; it only selects the pass.
 *
 * Pass II (text == NULL): copy str into a new grapheme cluster string and
 * set the break flag of each cluster.
 * If data is NULL, prohibit break.
 * Otherwise, allow break by rule above.
 *
 * @param[in] lbobj linebreak object; passed to gcstring_newcopy() and
 * receives the error number in lbobj->errnum when that copy fails.
 * @param[in] data NULL to prohibit every break inside the URL, non-NULL to
 * apply the CMoS based rules.
 * @param[in,out] str string to search (Pass I) or matched URL (Pass II).
 * @param[in] text non-NULL selects Pass I, NULL selects Pass II.
 * @return Pass I: NULL.  Pass II: the new grapheme cluster string, or NULL
 * with lbobj->errnum set if it could not be created.
 */
gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
				    unistr_t * str, unistr_t * text)
{
    gcstring_t *gcstr;
    size_t i;
    unichar_t *ptr;

    /* Pass I */

    if (text != NULL) {
	/*
	 * Search URL in str.
	 * Following code loosely refers RFC3986 but some practical
	 * assumptions are put:
	 *
	 * o Broken pct-encoded sequences (e.g. single "%") are allowed.
	 * o scheme names must end with alphanumeric, must be longer than
	 *   or equal to two octets, and must not contain more than one
	 *   non-alphanumeric ("+", "-" or ".").
	 * o URLs containing neither non-empty path, query part nor fragment
	 *   (e.g. "about:") are omitted: they are treated as ordinal words.
	 *
	 * ptr marks the start of the current candidate and is reset to NULL
	 * by the loop increment each time a candidate is rejected with
	 * "continue", so ptr is non-NULL after the loop only when the body
	 * ended with "break" (a match).  A rejected candidate is not
	 * re-scanned: the search resumes after the position i had reached.
	 */
	for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
	    int has_double_slash, has_authority, has_empty_path,
		has_no_query, has_no_fragment;
	    size_t alphadigit, nonalphadigit;

	    /* skip non-alpha. */
	    if (!is_alpha(str, i))
		continue;

	    ptr = str->str + i;

	    /* "url:" - case insensitive */
	    if (startswith(str, i, "url:", 4, 0))
		i += 4;

	    /* scheme */
	    if (is_alpha(str, i))
		i++;
	    else
		continue;

	    /* alphadigit starts at 1: the leading letter was consumed above. */
	    nonalphadigit = 0;
	    alphadigit = 1;
	    while (1) {
		if (is_alpha(str, i) || is_digit(str, i))
		    alphadigit++;
		else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
		    nonalphadigit++;
		else
		    break;
		i++;
	    }
	    /*
	     * Reject: fewer than two alphanumerics, more than one of
	     * "+", "-", ".", or a scheme not ending with an alphanumeric.
	     */
	    if (alphadigit < 2 || 1 < nonalphadigit ||
		! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
		continue;

	    /* ":" */
	    if (is(str, i, ':'))
		i++;
	    else
		continue;

	    /* hier-part */
	    has_double_slash = 0;
	    has_authority = 0;
	    has_empty_path = 0;
	    has_no_query = 0;
	    has_no_fragment = 0;
	    if (startswith(str, i, "//", 2, 0)) {
		/* "//" */
		has_double_slash = 1;
		i += 2;

		/* authority - FIXME:syntax relaxed */
		/* Note that "]" is accepted only after the first character. */
		if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
		    is_unreserved(str, i) || is_pct_encoded(str, i) ||
		    is_sub_delim(str, i)) {
		    has_authority = 1;
		    i++;
		    while (is(str, i, '[') || is(str, i, ']') ||
			   is(str, i, ':') || is(str, i, '@') ||
			   is_unreserved(str, i) || is_pct_encoded(str, i) ||
			   is_sub_delim(str, i))
			i++;
		}
	    }

	    /* path */
	    /*
	     * Dispatch on what followed the scheme:
	     *   "//" with authority    -> path_abempty
	     *   "//" without authority -> path_absolute
	     *   no "//"                -> rootless path handled inline below
	     */
	    if (has_double_slash) {
		if (has_authority)
		    goto path_abempty;
		else
		    goto path_absolute;
	    }
	    /* else goto path_rootless; */

	    /* path_rootless: */
	    if (is_pchar(str, i)) {	/* FIXME:path-noscheme not concerned */
		i++;
		while (is_pchar(str, i))
		    i++;
		goto path_abempty;
	    } else {
		has_empty_path = 1;
		goto path_empty;
	    }

	  path_absolute:
	    /* Reject a further "//"; otherwise a single "/" is required. */
	    if (startswith(str, i, "//", 2, 0))
		continue;
	    else if (is(str, i, '/')) {
		i++;
		if (is_pchar(str, i)) {
		    i++;
		    while (is_pchar(str, i))
			i++;
		}
		goto path_abempty;
	    } else
		continue;

	  path_abempty:
	    /* Optional "/" followed by any run of "/" and pchar. */
	    if (is(str, i, '/')) {
		i++;
		while (is(str, i, '/') || is_pchar(str, i))
		    i++;
	    }
	    /* else goto path_empty; */

	  path_empty:
	    ;

	    /* query */
	    if (is(str, i, '?')) {
		i++;
		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
		    i++;
	    } else
		has_no_query = 1;

	    /* fragment */
	    if (is(str, i, '#')) {
		i++;
		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
		    i++;
	    } else
		has_no_fragment = 1;

	    /*
	     * has_empty_path is only set on the rootless branch, so this
	     * drops scheme-only matches such as "about:".
	     */
	    if (has_empty_path && has_no_query && has_no_fragment)
		continue;

	    /* Match found: i is one past its last character. */
	    break;
	}

	/*
	 * Report the result through str.  The length must be computed while
	 * str->str still holds the original base pointer.
	 */
	if (ptr != NULL)
	    str->len = i - (ptr - str->str);
	str->str = ptr;
	return NULL;
    }

    /* Pass II */

    if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
	lbobj->errnum = errno ? errno : ENOMEM;
	return NULL;
    }

    /* non-break URI. */
    /* Index 0 is left as gcstring_newcopy() set it in both modes. */
    if (data == NULL) {
	for (i = 1; i < gcstr->gclen; i++)
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	return gcstr;
    }

    /* break URI. */
    /*
     * A leading "url:" gets a break opportunity right after it.
     * NOTE(review): the cast presumes gcstring_t starts with the same
     * members as unistr_t, and gcstr[4] presumes gclen > 4 whenever the
     * prefix matches - confirm both against the type definitions/caller.
     */
    if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
	gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	i = 5;
    } else
	i = 1;
    for (; i < gcstr->gclen; i++) {
	unichar_t u, v;
	/* u: character at the previous cluster's idx; v: at this one's. */
	u = gcstr->str[gcstr->gcstr[i - 1].idx];
	v = gcstr->str[gcstr->gcstr[i].idx];

	/*
	 * Some rules based on CMoS 15th ed.
	 * 17.11 1.1: [/] ÷ [^/]
	 * 17.11 2:   [-] ×
	 * 6.17 2:   [.] ×
	 * 17.11 1.2: ÷ [-~.,_?#%]
	 * 17.11 1.3: ÷ [=&]
	 * 17.11 1.3: [=&] ÷
	 * Default:  ALL × ALL
	 *
	 * The tests below are ordered: the first rule that applies wins.
	 */
	if (u == '/' && v != '/')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	else if (u == '-' || u == '.')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	else if (v == '-' || v == '~' || v == '.' || v == ',' ||
		 v == '_' || v == '?' || v == '#' || v == '%' ||
		 u == '=' || v == '=' || u == '&' || v == '&')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	else
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
    }

    /* Won't break punctuations at end of matches. */
    /*
     * Walk backwards from the last cluster and stop at the first one that
     * is not a breakable trailing punctuation mark.
     * NOTE(review): i is size_t, so a gclen of 0 would make the start
     * index wrap around; presumably Pass II never sees an empty match -
     * confirm against the caller.
     */
    for (i = gcstr->gclen - 1; 1 <= i; i--) {
	unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
	if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
	    (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
	     u == '>'))
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	else
	    break;
    }

    return gcstr;
}