/*
 * ===  FUNCTION  ======================================================================
 *         Name:  is_pchar
 *  Description:  Test whether a single character is acceptable inside a pchar
 *                as described by RFC3986, where a pchar is:
 *                unreserved / pct-encoded / sub-delims / ":" / "@".
 *                The test looks at one character only: it accepts whatever
 *                is_unreserved() or is_sub_delim() accept, plus ':' and '@'.
 *                It does not itself recognise a "%XX" escape triplet.
 *                Returns true if the character is accepted, false otherwise.
 * =====================================================================================
 */
static bool
is_pchar( char c)
{
    return is_unreserved(c) || is_sub_delim(c) || c == ':' || c == '@';
}
/*
 * Returns true when c is one of the extra characters accepted here
 * ('&', '=', '+', '$', ','); any other character is accepted only if
 * is_unreserved() accepts it.
 */
static bool is_password(char c)
{
    if (c == '&' || c == '=' || c == '+' || c == '$' || c == ',') {
        return true;
    }
    return is_unreserved(c);
}
/************************************************************************
 * Function : parse_uric
 *
 * Parameters :
 *   char *in   ; string of characters to scan
 *   int max    ; maximum number of characters that may be examined
 *   token *out ; token that receives the start pointer and length of
 *                the accepted run (no characters are copied)
 *
 * Description : Scans the run of uric characters starting at in[0], as
 *   defined in http://www.ietf.org/rfc/rfc2396.txt (RFC explaining
 *   URIs). A character is accepted when it is unreserved, reserved, or
 *   the start of an escape that is_escaped() recognises and that has at
 *   least two more characters before max.
 *
 * Return : int ; number of characters accepted (also stored in
 *   out->size)
 *
 * Note : out->buff aliases in.
 ************************************************************************/
int parse_uric( char *in, int max, token * out )
{
    int accepted = 0;

    for( ;; ) {
        if( accepted >= max ) {
            break;
        }
        if( !is_unreserved( in[accepted] )
            && !is_reserved( in[accepted] )
            && !( ( accepted + 2 < max ) && is_escaped( &in[accepted] ) ) ) {
            break;
        }
        accepted++;
    }

    out->size = accepted;
    out->buff = in;
    return accepted;
}
/**
 * Percent-encodes @p input.
 *
 * Characters accepted by is_unreserved() are copied through unchanged; every
 * other byte is written as '%' followed by two lower-case hexadecimal digits.
 *
 * @param input  Text to encode.
 * @return       The encoded string.
 */
std::string uri::encode(const std::experimental::string_view input) {
  static const std::array<char,16> hex =
    {{ '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f' }};

  std::string res;
  res.reserve(input.size());

  for (const auto chr : input) {
    if (is_unreserved(chr)) {
      res += chr;
      continue;
    }

    // Work on the unsigned byte value: where plain char is signed, bytes
    // >= 0x80 are negative and `chr >> 4` would index before the table.
    const auto byte = static_cast<unsigned char>(chr);
    res += '%';
    res += hex[ byte >> 4 ];
    res += hex[ byte & 0xf ];
  }

  return res;
}
/*
 * ===  FUNCTION  ======================================================================
 *         Name:  uri_norm_host
 *  Description:  Normalize the host section of the URI if it is set. The host section
 *                that is normalized here is the reg-name, and not the IP address.
 *                The IP address should be inside the IP section of the structure.
 *
 *                This section attempts to implement section 3.2.2 of RFC3986 with the
 *                exception that userinfo is not checked as HTTP and HTTPS do not
 *                specify the usage of it.
 *
 *                Works on the copy produced by URI_CP_PT(): every "%XX" triplet is
 *                passed through norm_pct(), every other character must be a
 *                sub-delim or unreserved, and upper-case letters are lowered.
 *                On success the previous host string is freed and replaced by the
 *                normalized copy; on failure the URI is left untouched.
 *
 *      Returns:  0 on success or when no host is set, EILSEQ for an illegal
 *                character or truncated escape, ENOMEM if the scratch buffer
 *                cannot be allocated, otherwise the error reported by norm_pct().
 * =====================================================================================
 */
extern int
uri_norm_host( uriobj_t *uri)
{
    int    err  = 0;
    size_t i    = 0;
    size_t len  = 0;
    char  *pct  = NULL;
    char  *host = URI_CP_PT(uri->uri_host);

    if( ! host ) {
        return 0;
    }
    /* Scratch buffer handed to norm_pct(); kept on the heap because
     * norm_pct() receives the address of the pointer. */
    pct = (char *) malloc(4);
    if( ! pct ) {
        /* host is assumed to be a heap copy owned by this function (it is
         * installed into the URI on success), so release it on failure. */
        free(host);
        return ENOMEM;
    }
    len = strlen(host);
    for( i = 0; i < len; i ++ ) {
        const unsigned char ch = (unsigned char) host[i];

        if( ch == '%' ) {
            /* Both hex digits must lie inside the string. */
            if( (i + 2) >= len ) {
                err = EILSEQ;
                break;
            }
            pct[0] = host[i];
            pct[1] = host[i + 1];
            pct[2] = host[i + 2];
            pct[3] = '\0';
            err = norm_pct(&pct);
            if( err ) {
                break;
            }
            host[i]     = pct[0];
            host[i + 1] = pct[1];
            host[i + 2] = pct[2];
            /* Skip the two hex digits; the loop increment steps over the
             * third character so nothing after the escape is skipped. */
            i += 2;
            continue;
        }
        if( !(is_sub_delim(host[i]) || is_unreserved(host[i])) ) {
            err = EILSEQ;
            break;
        }
        /* <ctype.h> functions require an unsigned char value. */
        if( isupper(ch) ) {
            host[i] = (char) tolower(ch);
        }
    }
    free(pct);
    if( err ) {
        free(host);
        return err;
    }
    /* Release the old string before installing the normalized copy. */
    free(*(uri->uri_host));
    *(uri->uri_host) = host;
    return 0;
}
// No special destructor needed, fragment is just char *. A url does not need to have // a fragment, so a NULL return value is not strictly an error. static char *get_fragment(char **s,unsigned int *fragment_out_err) { *fragment_out_err = NO_UPARSE_ERROR; if ((NULL == s) || (0 == strlen(*s))) { return NULL; } char *c = *s; if (FRAGMENT_DELIM != c[0]) { fprintf(stderr,"no %c as prefix\n",FRAGMENT_DELIM); *fragment_out_err = UPARSE_ERROR; return NULL; } c++; size_t fragment_len = strlen(c); if (fragment_len == 0) { // no fragment *fragment_out_err = NO_UPARSE_ERROR; return NULL; } char *fragment = strdup(c); if (NULL == fragment) { fprintf(stderr,"cannot allocate fragment\n"); *fragment_out_err = UPARSE_ERROR; return NULL; } char *test_fragment = fragment; while (*test_fragment) { if (!is_unreserved(*test_fragment)) { fprintf(stderr,"'%c' is invalid\n",*test_fragment); free(fragment); *fragment_out_err = UPARSE_ERROR; return NULL; } test_fragment++; } *fragment_out_err = NO_UPARSE_ERROR; return fragment; }
/*!
 * \brief Scans the run of uric characters starting at in[0], as defined in
 * http://www.ietf.org/rfc/rfc2396.txt (RFC explaining URIs).
 *
 * A character is accepted when it is unreserved, reserved, or the start of an
 * escape recognised by is_escaped() with at least two more characters
 * available before max. The token only references the input; nothing is
 * copied.
 *
 * \return Number of characters accepted (also stored in out->size).
 */
static size_t parse_uric(
	/*! [in] String of characters. */
	const char *in,
	/*! [in] Maximum limit. */
	size_t max,
	/*! [out] Token object that receives the start and length of the run. */
	token *out)
{
	size_t pos;

	for (pos = (size_t)0; pos < max; pos++) {
		if (is_unreserved(in[pos]) || is_reserved(in[pos])) {
			continue;
		}
		if ((pos + (size_t)2 < max) && is_escaped(&in[pos])) {
			continue;
		}
		break;
	}

	out->size = pos;
	out->buff = in;

	return pos;
}
/**
 * Decodes percent-encoded @p input.
 *
 * A '%' must be followed by two more characters, which are handed to
 * from_hex() to produce one output byte. Any other character is copied
 * through when is_reserved() or is_unreserved() accepts it. On a truncated
 * escape or a rejected character the result of decode_error() is returned,
 * receiving the text decoded so far.
 */
std::string uri::decode(const std::experimental::string_view input) {
  std::string decoded;
  decoded.reserve(input.size());

  auto cursor = input.cbegin();
  const auto last = input.cend();

  while (cursor != last) {
    if (*cursor != '%') {
      if (!is_reserved(*cursor) && !is_unreserved(*cursor)) {
        return decode_error(std::move(decoded));
      }
      decoded += *cursor;
      ++cursor;
      continue;
    }

    // Escape sequence: fetch the two characters that follow the '%'.
    if (++cursor >= last) {
      return decode_error(std::move(decoded));
    }
    const uint8_t high = (*cursor);

    if (++cursor >= last) {
      return decode_error(std::move(decoded));
    }
    const uint8_t low = (*cursor);

    decoded += static_cast<char>(from_hex(high, low));
    ++cursor;
  }

  return decoded;
}
/* True when c is accepted by is_hnv_unreserved() or, failing that, by is_unreserved(). */
static bool is_hvalue(char c)
{
    if (is_hnv_unreserved(c)) {
        return true;
    }
    return is_unreserved(c);
}
/* True when c is accepted by is_param_unreserved() or, failing that, by is_unreserved(). */
static bool is_paramchar(char c)
{
    if (is_param_unreserved(c)) {
        return true;
    }
    return is_unreserved(c);
}
// A url doesn't have to have a ?query arg list, so this can return NULL // and not be an error. static query_arg_list_t *get_query_arg_list(char **s,unsigned int *query_out_err) { if ((NULL == s) || (0 == strlen(*s))) { *query_out_err = NO_UPARSE_ERROR; return NULL; } char *c = *s; *query_out_err = UPARSE_ERROR; // default until end of function if (QUERY_DELIM != c[0]) { fprintf(stderr,"no %c as prefix\n",QUERY_DELIM); return NULL; } c++; if (NULL == c) { fprintf(stderr,"after %c found, no text\n",QUERY_DELIM); return NULL; } // at the end of the loop, t will point to the end of the query string, which // may not be the end of the url; it may have a #fragment char *t = c; unsigned int query_delim_count = 0; unsigned int query_pair_count = 0; unsigned int query_str_len = 0; while ((*t) && (FRAGMENT_DELIM != *t)) { if (QUERY_PAIR_DELIM == *t) { query_delim_count++; } t++; query_str_len++; } query_pair_count = query_delim_count + 1; char query_string[query_str_len+1]; bzero((void *) query_string,(query_str_len+1) * sizeof(char)); strlcpy(query_string,c,query_str_len+1); // stringify the delims for strsep char query_pair_delim_str[2]; snprintf(query_pair_delim_str,2,"%c",QUERY_PAIR_DELIM); char query_key_value_delim_str[2]; snprintf(query_key_value_delim_str,2,"%c",QUERY_KEY_VAL_DELIM); query_key_val_t **query_key_vals = (query_key_val_t **) malloc(query_pair_count * sizeof(query_key_val_t *)); if (NULL == query_key_vals) { fprintf(stderr,"could not allocate query_key_val\n"); return NULL; } bzero((void *) query_key_vals,query_pair_count * sizeof(query_key_val_t *)); unsigned int i = 0; char *pair_tok; char *free_query_string = query_string; while ((pair_tok = strsep(&free_query_string,query_pair_delim_str)) != NULL) { if (0 != strcmp("",pair_tok)) { if (i >= query_pair_count) { fprintf(stderr,"loop count %d >= previous pair count %d\n", i,query_pair_count); free_query_key_val_t_list(query_key_vals,query_pair_count); return NULL; } char *sep_pair_tok = 
strdup(pair_tok); if (NULL == sep_pair_tok) { fprintf(stderr,"could not allocate sep_pair_tok\n"); free_query_key_val_t_list(query_key_vals,query_pair_count); return NULL; } char *free_sep_pair_tok = sep_pair_tok; // put the key and val in a struct, and set as item i in a list // query_pair_count long (which will need to be allocated above) query_key_val_t *query_key_val_tok = (query_key_val_t *) malloc(sizeof(query_key_val_t)); if (NULL == query_key_val_tok) { fprintf(stderr,"could not allocate query_key_val_tok\n"); free_query_key_val_t_list(query_key_vals,query_pair_count); free(free_sep_pair_tok); return NULL; } bzero((void *) query_key_val_tok,sizeof(query_key_val_t)); bool seen_key = false; char *kv_tok = NULL; char *key = NULL; char *val = NULL; // make sure sep_pair_tok contains an = to delimit the key and value if (NULL == strstr(sep_pair_tok,query_key_value_delim_str)) { fprintf(stderr,"could not find '%s' in query pair '%s'\n", query_key_value_delim_str,sep_pair_tok); free_query_key_val_t_list(query_key_vals,query_pair_count); free_query_key_val_t(query_key_val_tok); free(free_sep_pair_tok); return NULL; } // break sep_pair_tok (a=b) into the key (a) and the value (b) while ((kv_tok = strsep(&sep_pair_tok,query_key_value_delim_str)) != NULL) { if (!seen_key) { key = strdup(kv_tok); seen_key = true; } else { val = strdup(kv_tok); } } free(kv_tok); if ((NULL == key) || (NULL == val)) { fprintf(stderr,"either key or val from %s was null\n",free_sep_pair_tok); free_query_key_val_t_list(query_key_vals,query_pair_count); free_query_key_val_t(query_key_val_tok); free(free_sep_pair_tok); free(key); free(val); return NULL; } char *key_val_list[] = {key,val}; char *test_kv; for (size_t j = 0;j < 2;j++) { test_kv = key_val_list[i]; while (*test_kv) { if (!is_unreserved(*test_kv)) { fprintf(stderr,"'%c' is invalid\n",*test_kv); free_query_key_val_t_list(query_key_vals,query_pair_count); free(free_sep_pair_tok); free_query_key_val_t(query_key_val_tok); free(key); 
free(val); return NULL; } test_kv++; } } query_key_val_tok->key = key; query_key_val_tok->val = val; query_key_vals[i] = query_key_val_tok; free(free_sep_pair_tok); i++; } } free(pair_tok); query_arg_list_t *query_arg_list = (query_arg_list_t *) malloc(sizeof(query_arg_list_t)); if (NULL == query_arg_list) { fprintf(stderr,"cannot allocate query_arg_list\n"); return NULL; } bzero((void *) query_arg_list,sizeof(query_arg_list_t)); query_arg_list->count = i; query_arg_list->query_key_vals = query_key_vals; *s = t; // pointer is now past query arg str, at # fragment delim if there *query_out_err = NO_UPARSE_ERROR; return query_arg_list; }
/* True when c is accepted by is_unreserved() or, failing that, by is_user_unreserved(). */
static bool is_user(char c)
{
    if (is_unreserved(c)) {
        return true;
    }
    return is_user_unreserved(c);
}
// A url does not need to have a path, so this can return NULL without an error // being thrown static path_t *get_path(char **s, unsigned int *path_out_err) { if ((NULL == s) || (0 == strlen(*s))) { *path_out_err = NO_UPARSE_ERROR; // not an error: url can have no path return NULL; } *path_out_err = UPARSE_ERROR; // default until end of function size_t delim_count = 0; size_t chars_until_query_delim = 0; char *c = *s; while (*c) { if (PATH_DELIM == *c) { delim_count++; } else if (QUERY_DELIM == *c) { break; } else if (!is_unreserved(*c)) { fprintf(stderr,"'%c' is invalid\n",*c); return NULL; } chars_until_query_delim++; c++; } char *just_path; char path_delim_str[2] = {PATH_DELIM,'\0'}; // "/" (not '/') char nonempty_path[chars_until_query_delim+1]; if (0 == delim_count) { // in this case, the path was '', not even '/'. so we create // a default empty path of '/' to represent what '' implies just_path = (char *) &path_delim_str; } else { bzero((void *) nonempty_path,(chars_until_query_delim + 1) * sizeof(char)); strlcpy(nonempty_path,*s,chars_until_query_delim+1); just_path = (char *) &nonempty_path; } path_t *path = (path_t *) malloc(sizeof(path_t)); if (NULL == path) { fprintf(stderr,"cannot allocate path\n"); return NULL; } bzero((void *) path,sizeof(path_t)); path->path_str = strdup(just_path); // set pointer to where the query delim may be found, now that *s // has been copied into just_path *s = c; // case of single '/' path (like: http://foo.com/?key=val) if (strlen(just_path) == 1) { path->path_elts = (char **) malloc(1 * sizeof(char *)); path->path_elts[0] = (char *) calloc(1,sizeof(char)); path->count = 1; *path_out_err = NO_UPARSE_ERROR; return path; } path->path_elts = (char **) malloc(delim_count * sizeof(char *)); if (NULL == path->path_elts) { fprintf(stderr,"cannot allocate path\n"); return NULL; } path->count = delim_count; char *tok; size_t i = 0; while (((tok = strsep(&just_path,path_delim_str)) != NULL) && (i < delim_count)) { if (0 != 
strcmp("",tok)) { path->path_elts[i] = strdup(tok); if (NULL == path->path_elts[i]) { fprintf(stderr,"cannot dup %s\n",tok); free_path_t(path); free(tok); return NULL; } i++; } } *path_out_err = NO_UPARSE_ERROR; return path; }
// Get the host:port section of the url. Doesn't support ipv6, unicode hosts, // username annotations etc. // Every url must have a host (but the port is optional). If this returns NULL, // it is an error. static host_port_t *get_host_port(char **s, unsigned int *host_port_out_err) { *host_port_out_err = UPARSE_ERROR; // default if ((NULL == s) || (0 == strlen(*s))) { fprintf(stderr,"arg pointer null\n"); return NULL; } char *c = *s; size_t host_len = 0; size_t port_len = 0; char *host_start = *s; char *port_start = NULL; bool seen_host_delim = false; while (*c) { // normally we would see the PATH_DELIM to break this, but // this can be '', so we must also look out for the // QUERY_DELIM if ((PATH_DELIM == c[0]) || (QUERY_DELIM == c[0])) { break; } else if (HOST_PORT_DELIM == c[0]) { seen_host_delim = true; c++; port_start = c; } else { // *c is part of the host or port, depending on delimiters seen if (!is_unreserved(*c)) { fprintf(stderr,"'%c' is invalid\n",*c); return NULL; } // If we have already seen the host/port delimiter (:), then // count the chars in the length of the port part. Otherwise, // count the chars in the length of the host part. seen_host_delim ? 
port_len++ : host_len++; c++; } } if (0 == host_len) { fprintf(stderr,"no host found\n"); return NULL; } if (seen_host_delim && (0 == port_len)) { fprintf(stderr,"port delimiter seen but no port number string\n"); return NULL; } host_port_t *host_port = (host_port_t *) malloc(sizeof(host_port_t)); if (NULL == host_port) { fprintf(stderr,"cannot allocate host_port\n"); return NULL; } bzero((void *) host_port,sizeof(host_port_t)); host_port->host = strndup(host_start,host_len); if (NULL == host_port->host) { fprintf(stderr,"cannot allocate host_port->host\n"); free_host_port_t(host_port); return NULL; } host_port->port = 0; if ((seen_host_delim) && (port_len > 0)) { char *port_chars = strndup(port_start,port_len); if (NULL == port_chars) { fprintf(stderr,"cannot allocate chars for host_port->port\n"); free_host_port_t(host_port); return NULL; } long port_long = (long) strtol(port_chars,NULL,10); if (0 == port_long) { fprintf(stderr,"zero port or cannot convert port_chars %s to long\n",port_chars); free_host_port_t(host_port); free(port_chars); return NULL; } free(port_chars); if (MAX_PORT < port_long) { fprintf(stderr,"port %ld out of range\n",port_long); free_host_port_t(host_port); *host_port_out_err = UPARSE_ERROR; return NULL; } host_port->port = (unsigned int) port_long; } *s = c; // set pointer past host/port *host_port_out_err = NO_UPARSE_ERROR; return host_port; }
// Writes a normalized copy of this URI into `other`.
//
// The copy is built left to right in other._M_uri, and each component view of
// `other` (scheme, userinfo, host, path, query, fragment) is pointed at the
// bytes written for it:
//   - the scheme and the host are lower-cased;
//   - a "%XX" escape whose value is_unreserved() accepts is replaced by the
//     character itself; any other escape is kept with upper-cased hex digits;
//   - an empty path becomes "/"; otherwise dot segments are removed while the
//     path is being copied;
//   - the port is written only when _M_hier_part.port is non-zero.
//
// Returns false when the destination buffer cannot be allocated, true
// otherwise.
//
// NOTE(review): the destination is allocated with _M_uri.length() + 1 bytes.
// This presumably covers the one extra '/' of an empty path; confirm that the
// port text written by snprintf can never make the output longer than that.
// NOTE(review): every '%' is assumed to be followed by two hex digits (d[1]
// and d[2] are read unconditionally) -- presumably guaranteed by the parser;
// confirm.
bool net::uri::uri::normalize(uri& other) const
{
  if (!other._M_uri.allocate(_M_uri.length() + 1)) {
    return false;
  }

  // d walks the source text, otherd the destination buffer.
  const char* d = _M_uri.data();
  const char* end = d + _M_uri.length();

  char* otherd = other._M_uri.data();

  // Copy scheme (lower-cased).
  size_t len = _M_scheme.length();
  for (size_t i = 0; i < len; i++) {
    *otherd++ = util::to_lower(*d++);
  }

  other._M_scheme.set(other._M_uri.data(), len);

  // Skip colon.
  d++;

  *otherd++ = ':';

  // If there is authority...
  if (_M_hier_part.host.length() > 0) {
    // Skip "//".
    d += 2;

    *otherd++ = '/';
    *otherd++ = '/';

    // If there is user information...
    if ((len = _M_hier_part.userinfo.length()) > 0) {
      // Save start of the userinfo.
      char* userinfo = otherd;

      // i counts source characters consumed; an escape consumes three
      // (two added here plus the ++i of the loop condition).
      size_t i = 0;
      do {
        if (*d == '%') {
          uint8_t c = (util::hex2dec(d[1]) * 16) + util::hex2dec(d[2]);
          if (is_unreserved(c)) {
            *otherd++ = static_cast<char>(c);
          } else {
            *otherd++ = '%';
            *otherd++ = util::to_upper(d[1]);
            *otherd++ = util::to_upper(d[2]);
          }

          d += 3;
          i += 2;
        } else {
          *otherd++ = *d++;
        }
      } while (++i < len);

      other._M_hier_part.userinfo.set(userinfo, otherd - userinfo);

      // Skip '@'.
      d++;

      *otherd++ = '@';
    }

    // IP literal?
    if (_M_hier_part.ip_literal) {
      // Skip '['.
      d++;

      *otherd++ = '[';
    }

    // Save start of the host.
    char* host = otherd;

    // Copy host (lower-cased, including characters decoded from escapes).
    len = _M_hier_part.host.length();
    size_t i = 0;
    do {
      if (*d == '%') {
        uint8_t c = (util::hex2dec(d[1]) * 16) + util::hex2dec(d[2]);
        if (is_unreserved(c)) {
          *otherd++ = static_cast<char>(util::to_lower(c));
        } else {
          *otherd++ = '%';
          *otherd++ = util::to_upper(d[1]);
          *otherd++ = util::to_upper(d[2]);
        }

        d += 3;
        i += 2;
      } else {
        *otherd++ = util::to_lower(*d++);
      }
    } while (++i < len);

    other._M_hier_part.host.set(host, otherd - host);

    // IP literal?
    if (_M_hier_part.ip_literal) {
      // Skip ']'.
      d++;

      *otherd++ = ']';
    }

    other._M_hier_part.ip_literal = _M_hier_part.ip_literal;

    // If not a standard port...
    // (a port value of 0 is what suppresses the ":port" text here)
    if (_M_hier_part.port != 0) {
      *otherd++ = ':';

      const char* otherend = other._M_uri.data() + other._M_uri.capacity();
      otherd += snprintf(otherd, otherend - otherd, "%u", _M_hier_part.port);
    }

    other._M_hier_part.port = _M_hier_part.port;
  }

  // Save start of the path.
  char* path = otherd;

  // If the path is empty...
  if ((len = _M_hier_part.path.length()) == 0) {
    *otherd++ = '/';
  } else {
    // Remove dot segments.
    // From here on d is re-pointed at each component's own data instead of
    // continuing the walk through _M_uri.
    d = _M_hier_part.path.data();

    // After every character written, the tail of the output is checked
    // against the dot-segment patterns of RFC 3986 section 5.2.4 and
    // rewritten in place.
    // NOTE(review): `end` is the end of the whole URI, not of the path. If
    // path.data() points into _M_uri, the `d == end` tests below can only
    // succeed when no query or fragment follows the path -- confirm that a
    // trailing "/." or "/.." before '?' or '#' is meant to be left as is.
    size_t i = 0;
    do {
      if (*d == '%') {
        uint8_t c = (util::hex2dec(d[1]) * 16) + util::hex2dec(d[2]);
        if (is_unreserved(c)) {
          *otherd++ = static_cast<char>(c);
        } else {
          *otherd++ = '%';
          *otherd++ = util::to_upper(d[1]);
          *otherd++ = util::to_upper(d[2]);
        }

        d += 3;
        i += 2;
      } else {
        *otherd++ = *d++;
      }

      // A. If the input buffer begins with a prefix of "../" or "./",
      //    then remove that prefix from the input buffer; otherwise,
      if (((path + 3 == otherd) &&
           (path[0] == '.') &&
           (path[1] == '.') &&
           (path[2] == '/')) ||
          ((path + 2 == otherd) &&
           (path[0] == '.') &&
           (path[1] == '/'))) {
        otherd = path;

      // B. if the input buffer begins with a prefix of "/./" or "/.",
      //    where "." is a complete path segment, then replace that
      //    prefix with "/" in the input buffer; otherwise,
      } else if ((path + 3 <= otherd) &&
                 (otherd[-3] == '/') &&
                 (otherd[-2] == '.') &&
                 (otherd[-1] == '/')) {
        otherd -= 2;
      } else if ((path + 2 <= otherd) &&
                 (d == end) &&
                 (otherd[-2] == '/') &&
                 (otherd[-1] == '.')) {
        otherd--;

      // C. if the input buffer begins with a prefix of "/../" or "/..",
      //    where ".." is a complete path segment, then replace that
      //    prefix with "/" in the input buffer and remove the last
      //    segment and its preceding "/" (if any) from the output
      //    buffer; otherwise,
      } else if ((path + 4 <= otherd) &&
                 (otherd[-4] == '/') &&
                 (otherd[-3] == '.') &&
                 (otherd[-2] == '.') &&
                 (otherd[-1] == '/')) {
        otherd -= 4;

        // Back up to the '/' that starts the previous segment (or to the
        // start of the path).
        while (otherd > path) {
          if (*--otherd == '/') {
            break;
          }
        }

        *otherd++ = '/';
      } else if ((path + 3 <= otherd) &&
                 (d == end) &&
                 (otherd[-3] == '/') &&
                 (otherd[-2] == '.') &&
                 (otherd[-1] == '.')) {
        otherd -= 3;

        // Same backing-up as above, for a trailing "/..".
        while (otherd > path) {
          if (*--otherd == '/') {
            break;
          }
        }

        *otherd++ = '/';

      // D. if the input buffer consists only of "." or "..", then remove
      //    that from the input buffer; otherwise,
      } else if (((path + 1 == otherd) &&
                  (d == end) &&
                  (otherd[-1] == '.')) ||
                 ((path + 2 == otherd) &&
                  (d == end) &&
                  (otherd[-2] == '.') &&
                  (otherd[-1] == '.'))) {
        otherd = path;
      }
    } while (++i < len);
  }

  // The path view is set only when dot-segment removal left something.
  if (path < otherd) {
    other._M_hier_part.path.set(path, otherd - path);
  }

  // If there is query...
  if ((len = _M_query.length()) > 0) {
    *otherd++ = '?';

    // Save start of the query.
    char* query = otherd;

    d = _M_query.data();

    size_t i = 0;
    do {
      if (*d == '%') {
        uint8_t c = (util::hex2dec(d[1]) * 16) + util::hex2dec(d[2]);
        if (is_unreserved(c)) {
          *otherd++ = static_cast<char>(c);
        } else {
          *otherd++ = '%';
          *otherd++ = util::to_upper(d[1]);
          *otherd++ = util::to_upper(d[2]);
        }

        d += 3;
        i += 2;
      } else {
        *otherd++ = *d++;
      }
    } while (++i < len);

    other._M_query.set(query, otherd - query);
  }

  // If there is fragment...
  if ((len = _M_fragment.length()) > 0) {
    *otherd++ = '#';

    // Save start of the fragment.
    char* fragment = otherd;

    d = _M_fragment.data();

    size_t i = 0;
    do {
      if (*d == '%') {
        uint8_t c = (util::hex2dec(d[1]) * 16) + util::hex2dec(d[2]);
        if (is_unreserved(c)) {
          *otherd++ = static_cast<char>(c);
        } else {
          *otherd++ = '%';
          *otherd++ = util::to_upper(d[1]);
          *otherd++ = util::to_upper(d[2]);
        }

        d += 3;
        i += 2;
      } else {
        *otherd++ = *d++;
      }
    } while (++i < len);

    other._M_fragment.set(fragment, otherd - fragment);
  }

  // Record how many bytes were written in total.
  other._M_uri.length(otherd - other._M_uri.data());

  return true;
}
/**
 Decodes URL-encoded data.
 Every input byte produces at most one output byte, so the conversion is done
 in place. The position inside a "%XX" escape is kept in @a state, which lets
 an escape continue from one call to the next.

 '+' decodes to a space; any other literal character must be accepted by
 is_unreserved().

 @return Number of decoded bytes written to data. Negative integer if
         data is not valid URL-encoded sequence.
*/
ssize_t urldecode(decoder_state *state, char *data, size_t size)
{
	size_t rd;
	size_t wr = 0;
	int hi, lo;

	for (rd = 0; rd < size; rd++) {
		const char ch = data[rd];

		if (ch == '%') {
			/* An escape may only start outside of another escape. */
			if (state->state != ST_SYM)
				return -1;
			state->state = ST_PERCENT;
		} else if (ch == '+') {
			if (state->state != ST_SYM)
				return -1;
			data[wr++] = ' ';
		} else if (state->state == ST_PERCENT_AND_SYM) {
			/* Second hex digit: combine it with the stored first one. */
			hi = hexdigit(state->sym);
			lo = hexdigit(ch);
			if (hi < 0 || lo < 0)
				return -1;
			data[wr++] = (hi << 4) | lo;
			state->state = ST_SYM;
		} else if (state->state == ST_PERCENT) {
			/* First hex digit: remember it until the second arrives. */
			state->sym = ch;
			state->state = ST_PERCENT_AND_SYM;
		} else if (state->state == ST_SYM) {
			if (!is_unreserved(ch))
				return -1;
			data[wr++] = ch;
		}
	}

	return wr;
}
/** Built-in preprocessing callback
 *
 * Built-in preprocessing callback to break or not to break URLs according to
 * some rules by Chicago Manual of Style 15th ed.
 *
 * The function has two modes, selected by text:
 *
 * - Pass I (text is not NULL): scan str for the first substring that looks
 *   like a URL. The result is reported by modifying str in place: str->str is
 *   set to the start of the match (NULL when nothing matched) and, on a match,
 *   str->len to its length. The return value is always NULL.
 * - Pass II (text is NULL): copy str into a new grapheme cluster string and
 *   set the break flag of each cluster.
 *   If data is NULL, prohibit break.
 *   Otherwise, allow break by rule above.
 *
 * @param lbobj linebreak object; receives errnum when the copy fails.
 * @param data  NULL to prohibit breaks inside the URL, non-NULL to allow them.
 * @param str   string to search (Pass I) or matched substring (Pass II).
 * @param text  non-NULL selects Pass I, NULL selects Pass II.
 * @return Pass I: NULL. Pass II: the new grapheme cluster string, or NULL
 *         with lbobj->errnum set when it could not be created.
 */
gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
                                    unistr_t * str, unistr_t * text)
{
    gcstring_t *gcstr;
    size_t i;
    unichar_t *ptr;

    /* Pass I */

    if (text != NULL) {
        /*
         * Search URL in str.
         * Following code loosely refers RFC3986 but some practical
         * assumptions are put:
         *
         * o Broken pct-encoded sequences (e.g. single "%") are allowed.
         * o scheme names must end with alphanumeric, must be longer than
         *   or equal to two octets, and must not contain more than one
         *   non-alphanumeric ("+", "-" or ".").
         * o URLs containing neither non-empty path, query part nor fragment
         *   (e.g. "about:") are omitted: they are treated as ordinal words.
         */
        /*
         * ptr marks the start of the current candidate. Every "continue"
         * rejects the candidate: the loop's iteration expression resets ptr
         * to NULL and resumes scanning after the current position. The
         * "break" at the bottom of the body keeps ptr and i as the match.
         */
        for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
            int has_double_slash, has_authority, has_empty_path,
                has_no_query, has_no_fragment;
            size_t alphadigit, nonalphadigit;

            /* skip non-alpha. */
            if (!is_alpha(str, i))
                continue;

            ptr = str->str + i;

            /* "url:" - case insensitive */
            if (startswith(str, i, "url:", 4, 0))
                i += 4;

            /* scheme */
            if (is_alpha(str, i))
                i++;
            else
                continue;

            /* Count the scheme characters; the leading alpha is already
             * counted in alphadigit. */
            nonalphadigit = 0;
            alphadigit = 1;
            while (1) {
                if (is_alpha(str, i) || is_digit(str, i))
                    alphadigit++;
                else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
                    nonalphadigit++;
                else
                    break;
                i++;
            }
            /* Reject schemes that are too short, contain more than one of
             * "+-." or do not end with an alphanumeric. */
            if (alphadigit < 2 || 1 < nonalphadigit ||
                ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
                continue;

            /* ":" */
            if (is(str, i, ':'))
                i++;
            else
                continue;

            /* hier-part */
            has_double_slash = 0;
            has_authority = 0;
            has_empty_path = 0;
            has_no_query = 0;
            has_no_fragment = 0;
            if (startswith(str, i, "//", 2, 0)) {
                /* "//" */
                has_double_slash = 1;
                i += 2;

                /* authority - FIXME:syntax relaxed */
                if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
                    is_unreserved(str, i) || is_pct_encoded(str, i) ||
                    is_sub_delim(str, i)) {
                    has_authority = 1;
                    i++;
                    while (is(str, i, '[') || is(str, i, ']') ||
                           is(str, i, ':') || is(str, i, '@') ||
                           is_unreserved(str, i) || is_pct_encoded(str, i) ||
                           is_sub_delim(str, i))
                        i++;
                }
            }

            /* path */
            /* Dispatch on what preceded the path: "//authority" is followed
             * by path-abempty, a bare "//" by path-absolute, and no "//" by
             * path-rootless (which falls through below). */
            if (has_double_slash) {
                if (has_authority)
                    goto path_abempty;
                else
                    goto path_absolute;
            }                   /* else goto path_rootless; */

            /* path_rootless: */
            if (is_pchar(str, i)) {     /* FIXME:path-noscheme not concerned */
                i++;
                while (is_pchar(str, i))
                    i++;
                goto path_abempty;
            } else {
                has_empty_path = 1;
                goto path_empty;
            }

          path_absolute:
            /* "//" followed by another "//" is rejected. */
            if (startswith(str, i, "//", 2, 0))
                continue;
            else if (is(str, i, '/')) {
                i++;
                if (is_pchar(str, i)) {
                    i++;
                    while (is_pchar(str, i))
                        i++;
                }
                goto path_abempty;
            } else
                continue;

          path_abempty:
            if (is(str, i, '/')) {
                i++;
                while (is(str, i, '/') || is_pchar(str, i))
                    i++;
            }                   /* else goto path_empty; */

          path_empty:
            ;

            /* query */
            if (is(str, i, '?')) {
                i++;
                while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
                    i++;
            } else
                has_no_query = 1;

            /* fragment */
            if (is(str, i, '#')) {
                i++;
                while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
                    i++;
            } else
                has_no_fragment = 1;

            /* e.g. "about:" - nothing after the scheme, not treated as URL. */
            if (has_empty_path && has_no_query && has_no_fragment)
                continue;

            /* Match found: ptr is its start, i is one past its end. */
            break;
        }

        /* Report the match through str; ptr is NULL when the loop ran out
         * of input without a match. */
        if (ptr != NULL)
            str->len = i - (ptr - str->str);
        str->str = ptr;
        return NULL;
    }

    /* Pass II */

    if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
        lbobj->errnum = errno ? errno : ENOMEM;
        return NULL;
    }

    /* non-break URI. */
    if (data == NULL) {
        /* Flag every cluster but the first so no break is allowed inside. */
        for (i = 1; i < gcstr->gclen; i++)
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        return gcstr;
    }

    /* break URI. */
    /* A leading "url:" may be broken after; rule processing then starts
     * behind it. */
    if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
        gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        i = 5;
    } else
        i = 1;
    for (; i < gcstr->gclen; i++) {
        unichar_t u, v;
        /* u: character at the index of the previous cluster,
         * v: character at the index of the current cluster. */
        u = gcstr->str[gcstr->gcstr[i - 1].idx];
        v = gcstr->str[gcstr->gcstr[i].idx];

        /*
         * Some rules based on CMoS 15th ed.
         * 17.11 1.1: [/] ÷ [^/]
         * 17.11 2:   [-] ×
         * 6.17 2:   [.] ×
         * 17.11 1.2: ÷ [-~.,_?#%]
         * 17.11 1.3: ÷ [=&]
         * 17.11 1.3: [=&] ÷
         * Default:  ALL × ALL
         */
        if (u == '/' && v != '/')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        else if (u == '-' || u == '.')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        else if (v == '-' || v == '~' || v == '.' || v == ',' ||
                 v == '_' || v == '?' || v == '#' || v == '%' ||
                 u == '=' || v == '=' || u == '&' || v == '&')
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
        else
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
    }

    /* Won't break punctuations at end of matches. */
    /* Walk backwards over the trailing clusters that are both breakable and
     * one of the listed punctuation marks; stop at the first that is not. */
    /* NOTE(review): i is size_t, so with gclen == 0 the start value wraps
     * around - presumably gclen is never 0 here; confirm. */
    for (i = gcstr->gclen - 1; 1 <= i; i--) {
        unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
        if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
            (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
             u == '>'))
            gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
        else
            break;
    }
    return gcstr;
}