/*
 * ===  FUNCTION  ======================================================================
 *         Name:  uri_norm_path
 *  Description:  Normalize the path.  At present this only checks the path for
 *                illegal characters; upper and lower case letters are allowed.
 *
 *                Every byte must satisfy is_pchar(), be a '/', or start a
 *                three byte percent-encoded triplet ("%XX").  Each triplet is
 *                handed to norm_pct() and the three bytes it leaves behind are
 *                copied to the output.
 *
 *                On success the old path string is freed and replaced with the
 *                newly built one.  On failure the URI object is left untouched.
 *
 *       Return:  0       on success
 *                EINVAL  if uri is NULL or it has no path
 *                ENOMEM  if a working buffer could not be allocated
 *                EILSEQ  on an illegal character or a truncated "%XX" triplet
 *                other   whatever non-zero code norm_pct() reports
 * =====================================================================================
 */
extern int
uri_norm_path( uriobj_t *uri)
{
    int    err = 0;
    size_t len = 0,
           i   = 0,
           n   = 0;
    char  *path,
          *ou,
          *pct;

    if( !uri){
        return EINVAL;
    }
    path = URI_CP_PT(uri->uri_path);
    if( !path){
        return EINVAL;
    }
    len = strlen(path);
    /* The output never grows: one byte in gives one byte out, and a triplet
     * gives exactly three bytes, so len + 1 is always enough. */
    ou  = malloc(len + 1);
    /* norm_pct() takes a char **, so the triplet buffer is kept on the heap
     * in case it replaces the pointer -- NOTE(review): confirm against
     * norm_pct(). */
    pct = malloc(4);
    /* Test the pointers themselves; errno may hold a stale value from an
     * unrelated earlier call. */
    if( !ou || !pct){
        free(ou);
        free(pct);
        return ENOMEM;
    }
    for( i = 0; i < len; i ++){
        if( is_pchar(path[i]) || path[i] == '/'){
            ou[n ++] = path[i];
        } else if( path[i] == '%'){
            /* A triplet needs the '%' plus two more bytes before the NUL. */
            if( len - i < 3){
                err = EILSEQ;
                break;
            }
            pct[0] = path[i];
            pct[1] = path[i + 1];
            pct[2] = path[i + 2];
            pct[3] = '\0';
            err = norm_pct(&pct);
            if( err ){
                break;
            }
            ou[n ++] = pct[0];
            ou[n ++] = pct[1];
            ou[n ++] = pct[2];
            /* Skip the two hex digits only; the loop increment steps over
             * the '%' itself, so the byte after the triplet is not lost. */
            i += 2;
        } else {
            err = EILSEQ;
            break;
        }
    }
    free(pct);
    if( err ){
        /* Keep the caller's original path instead of a truncated copy. */
        free(ou);
        return err;
    }
    ou[n] = '\0';
    /* Hand the freshly built buffer over to the URI object. */
    free(*uri->uri_path);
    *uri->uri_path = ou;
    return 0;
}
/** Built-in preprocessing callback
 *
 * Built-in preprocessing callback to break or not to break URLs according to
 * some rules by Chicago Manual of Style 15th ed.
 * If data is NULL, prohibit break.
 * Otherwise, allow break by rule above.
 *
 * The callback works in two passes, selected by the text argument.
 *
 * Pass I (text is not NULL): scan str for the first substring that looks
 * like a URL.  On a match, str->str is moved to the start of the match and
 * str->len is set to its length.  When nothing matches, str->str is set to
 * NULL and str->len is left as it was.  This pass always returns NULL.
 *
 * Pass II (text is NULL): copy str into a new grapheme cluster string and
 * set a break flag on every cluster except the first.
 *
 * @param[in] lbobj linebreak object; its errnum is set when the copy fails.
 * @param[in] data NULL to prohibit every break inside the URL, non-NULL to
 * allow breaks by the rules above.
 * @param[in,out] str string to scan (Pass I) or matched substring (Pass II).
 * @param[in] text non-NULL selects Pass I, NULL selects Pass II.  Only
 * tested against NULL here.
 * @return NULL in Pass I, and in Pass II when gcstring_newcopy() fails
 * (lbobj->errnum is then errno, or ENOMEM if errno is 0).  Otherwise the
 * new grapheme cluster string with flags set.
 */
gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
                                    unistr_t * str, unistr_t * text)
{
    gcstring_t *gcstr;
    size_t i;
    unichar_t *ptr;

    /* Pass I */

    if (text != NULL) {
        /*
         * Search URL in str.
         * Following code loosely refers RFC3986 but some practical
         * assumptions are put:
         *
         * o Broken pct-encoded sequences (e.g. single "%") are allowed.
         * o scheme names must end with alphanumeric, must be longer than
         *   or equal to two octets, and must not contain more than one
         *   non-alphanumeric ("+", "-" or ".").
         * o URLs containing neither non-empty path, query part nor fragment
         *   (e.g. "about:") are omitted: they are treated as ordinal words.
         */
        /* ptr marks the start of the current candidate.  Every "continue"
         * abandons the candidate: the loop increment resets ptr to NULL and
         * scanning resumes just past the position reached so far. */
        for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
            int has_double_slash, has_authority, has_empty_path,
                has_no_query, has_no_fragment;
            size_t alphadigit, nonalphadigit;

            /* skip non-alpha. */
            if (!is_alpha(str, i))
                continue;

            ptr = str->str + i;

            /* "url:" - case insensitive */
            if (startswith(str, i, "url:", 4, 0))
                i += 4;

            /* scheme */
            if (is_alpha(str, i))
                i++;
            else
                continue;

            nonalphadigit = 0;
            alphadigit = 1;
            while (1) {
                if (is_alpha(str, i) || is_digit(str, i))
                    alphadigit++;
                else if (is(str, i, '+') || is(str, i, '-') ||
                         is(str, i, '.'))
                    nonalphadigit++;
                else
                    break;
                i++;
            }
            if (alphadigit < 2 || 1 < nonalphadigit ||
                ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
                continue;

            /* ":" */
            if (is(str, i, ':'))
                i++;
            else
                continue;

            /* hier-part */
            has_double_slash = 0;
            has_authority = 0;
            has_empty_path = 0;
            has_no_query = 0;
            has_no_fragment = 0;
            if (startswith(str, i, "//", 2, 0)) {
                /* "//" */
                has_double_slash = 1;
                i += 2;

                /* authority - FIXME:syntax relaxed */
                if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
                    is_unreserved(str, i) || is_pct_encoded(str, i) ||
                    is_sub_delim(str, i)) {
                    has_authority = 1;
                    i++;
                    while (is(str, i, '[') || is(str, i, ']') ||
                           is(str, i, ':') || is(str, i, '@') ||
                           is_unreserved(str, i) || is_pct_encoded(str, i) ||
                           is_sub_delim(str, i))
                        i++;
                }
            }

            /* path */
            if (has_double_slash) {
                if (has_authority)
                    goto path_abempty;
                else
                    goto path_absolute;
            }                   /* else goto path_rootless; */

            /* path_rootless: */
            if (is_pchar(str, i)) { /* FIXME:path-noscheme not concerned */
                i++;
                while (is_pchar(str, i))
                    i++;
                goto path_abempty;
            } else {
                has_empty_path = 1;
                goto path_empty;
            }

          path_absolute:
            if (startswith(str, i, "//", 2, 0))
                continue;
            else if (is(str, i, '/')) {
                i++;
                if (is_pchar(str, i)) {
                    i++;
                    while (is_pchar(str, i))
                        i++;
                }
                goto path_abempty;
            } else
                continue;

          path_abempty:
            if (is(str, i, '/')) {
                i++;
                while (is(str, i, '/') || is_pchar(str, i))
                    i++;
            }                   /* else goto path_empty; */

          path_empty:
            ;

            /* query */
            if (is(str, i, '?')) {
                i++;
                while (is(str, i, '/') || is(str, i, '?') ||
                       is_pchar(str, i))
                    i++;
            } else
                has_no_query = 1;

            /* fragment */
            if (is(str, i, '#')) {
                i++;
                while (is(str, i, '/') || is(str, i, '?') ||
                       is_pchar(str, i))
                    i++;
            } else
                has_no_fragment = 1;

            if (has_empty_path && has_no_query && has_no_fragment)
                continue;

            /* Candidate accepted: leave the loop with ptr still set. */
            break;
        }

        /* ptr is non-NULL only when the loop was left through "break". */
        if (ptr != NULL)
            str->len = i - (ptr - str->str);
        str->str = ptr;
        return NULL;
    }

    /* Pass II */

    if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
        lbobj->errnum = errno ? errno : ENOMEM;
        return NULL;
    }

    /* non-break URI. */
    if (data == NULL) {
        for (i = 1; i < gcstr->gclen; i++)
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        return gcstr;
    }

    /* Nothing to flag in an empty string.  Returning here also keeps the
     * backward scan at the end from starting at gclen - 1, which wraps
     * around because gclen is unsigned. */
    if (gcstr->gclen == 0)
        return gcstr;

    /* break URI. */
    /* Cluster 4 is touched only if it exists: "url:" may be the whole
     * string, or may span fewer than five grapheme clusters. */
    if (4 < gcstr->gclen &&
        startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
        gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        i = 5;
    } else
        i = 1;
    for (; i < gcstr->gclen; i++) {
        unichar_t u, v;

        /* u: first character of the previous cluster; v: of this one. */
        u = gcstr->str[gcstr->gcstr[i - 1].idx];
        v = gcstr->str[gcstr->gcstr[i].idx];

        /*
         * Some rules based on CMoS 15th ed.
         * 17.11 1.1: [/] ÷ [^/]
         * 17.11 2:   [-] ×
         * 6.17 2:   [.] ×
         * 17.11 1.2: ÷ [-~.,_?#%]
         * 17.11 1.3: ÷ [=&]
         * 17.11 1.3: [=&] ÷
         * Default:  ALL × ALL
         */
        if (u == '/' && v != '/')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        else if (u == '-' || u == '.')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        else if (v == '-' || v == '~' || v == '.' || v == ',' ||
                 v == '_' || v == '?' || v == '#' || v == '%' ||
                 u == '=' || v == '=' || u == '&' || v == '&')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        else
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
    }

    /* Won't break punctuations at end of matches. */
    /* Walk backwards from the last cluster and stop at the first one that
     * is not a breakable trailing punctuation mark. */
    for (i = gcstr->gclen - 1; 1 <= i; i--) {
        unichar_t u = gcstr->str[gcstr->gcstr[i].idx];

        if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
            (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
             u == '>'))
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        else
            break;
    }

    return gcstr;
}