Beispiel #1
0
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  is_reserved
 *  Description:  Reserved characters can only be used in certain parts of the URI
 *                string.
 * =====================================================================================
 */
static bool
is_reserved( char c )
{
	bool rv = false;
	if( is_gen_delim(c) || is_sub_delim(c) ) {
		rv = true;
	}
	return rv;
}
Beispiel #2
0
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  is_pchar
 *  Description:  Test if a character is a pchar as desribed by RFC3986.  A pchar
 *                is: unreserved / pct-encoded / sub-delims / ":" / "@", if character
 *                is a pchar return true otherwise return false.
 * =====================================================================================
 */
static bool
is_pchar( char c)
{
	bool result  = false;
	if( is_unreserved(c) || is_sub_delim(c) ){
		result = true;
	}
	else if( c == ':' || c == '@'){
		result = true;
	}
	return result;
}
Beispiel #3
0
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  uri_norm_host
 *  Description:  Normalize the host section of the URI if it is set.  The host section
 *                that is normalized here is the reg-name, and not the IP address.
 *                The IP address should be inside the IP section of the structure.
 *
 *                This section attempts to implement section 3.2.2 of RFC3986 with the
 *                exception that userinfo is not checked as HTTP and HTTPS do not 
 *                specify the usage of it.
 * =====================================================================================
 */
extern int
uri_norm_host( uriobj_t *uri)
{
	int   err  = 0, i, len;
	char *pct  = (char *) malloc(4),
	     *host = URI_CP_PT(uri->uri_host);
	if( host ) {
		len = strlen(host);
		for(i = 0;i < len; i ++ ) {
			if( host[i] == '%'){
				if( (i + 2) > len ) {
					err = EILSEQ;
					break;
				}
				pct[0] = host[i ++];
				pct[1] = host[i ++];
				pct[2] = host[i ++];
				pct[3] = '\0';
				err = norm_pct(&pct);
				if( err ) {
					break;
				}
				host[(i - 3)] = pct[0];
				host[(i - 2)] = pct[1];
				host[(i - 1)] = pct[2];
				continue;
			}
			if(!(is_sub_delim(host[i]) || is_unreserved(host[i]))){
				err = EILSEQ;
				break;	
			}
			if( isalpha(host[i]) && isupper(host[i]) ){
				host[i] = tolower(host[i]);
			}
		}
		if( ! err ) {
			*(uri->uri_host) = NULL;
			free(*uri->uri_host);
			*(uri->uri_host) = host;
		}
	}
	return err;
}
Beispiel #4
0
/** Built-in preprocessing callback
 *
 * Built-in preprocessing callback to break or not to break URLs according to
 * some rules by Chicago Manual of Style 15th ed.
 * If data is NULL, prohibit break.
 * Otherwise, allow break by rule above.
 */
gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
				    unistr_t * str, unistr_t * text)
{
    gcstring_t *gcstr;
    size_t i;
    unichar_t *ptr;

    /* Pass I */

    if (text != NULL) {
	/*
	 * Search URL in str.
	 * Following code loosely refers RFC3986 but some practical
	 * assumptions are put:
	 *
	 * o Broken pct-encoded sequences (e.g. single "%") are allowed.
	 * o scheme names must end with alphanumeric, must be longer than
	 *   or equal to two octets, and must not contain more than one
	 *   non-alphanumeric ("+", "-" or ".").
	 * o URLs containing neither non-empty path, query part nor fragment
	 *   (e.g. "about:") are omitted: they are treated as ordinal words.
	 */
	for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
	    int has_double_slash, has_authority, has_empty_path,
		has_no_query, has_no_fragment;
	    size_t alphadigit, nonalphadigit;

	    /* skip non-alpha. */
	    if (!is_alpha(str, i))
		continue;

	    ptr = str->str + i;

	    /* "url:" - case insensitive */
	    if (startswith(str, i, "url:", 4, 0))
		i += 4;

	    /* scheme */
	    if (is_alpha(str, i))
		i++;
	    else
		continue;

	    nonalphadigit = 0;
	    alphadigit = 1;
	    while (1) {
		if (is_alpha(str, i) || is_digit(str, i))
		    alphadigit++;
		else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
		    nonalphadigit++;
		else
		    break;
		i++;
	    }
	    if (alphadigit < 2 || 1 < nonalphadigit ||
	        ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
		continue;

	    /* ":" */
	    if (is(str, i, ':'))
		i++;
	    else
		continue;

	    /* hier-part */
	    has_double_slash = 0;
	    has_authority = 0;
	    has_empty_path = 0;
	    has_no_query = 0;
	    has_no_fragment = 0;
	    if (startswith(str, i, "//", 2, 0)) {
		/* "//" */
		has_double_slash = 1;
		i += 2;

		/* authority - FIXME:syntax relaxed */
		if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
		    is_unreserved(str, i) || is_pct_encoded(str, i) ||
		    is_sub_delim(str, i)) {
		    has_authority = 1;
		    i++;
		    while (is(str, i, '[') || is(str, i, ']') ||
			   is(str, i, ':') || is(str, i, '@') ||
			   is_unreserved(str, i) || is_pct_encoded(str, i) ||
			   is_sub_delim(str, i))
			i++;
		}
	    }

	    /* path */
	    if (has_double_slash) {
		if (has_authority)
		    goto path_abempty;
		else
		    goto path_absolute;
	    } /* else goto path_rootless; */

	    /* path_rootless: */
	    if (is_pchar(str, i)) { /* FIXME:path-noscheme not concerned */
		i++;
		while (is_pchar(str, i))
		    i++;
		goto path_abempty;
	    } else {
		has_empty_path = 1;
		goto path_empty;
	    }

	  path_absolute:
	    if (startswith(str, i, "//", 2, 0))
		continue;
	    else if (is(str, i, '/')) {
		i++;
		if (is_pchar(str, i)) {
		    i++;
		    while (is_pchar(str, i))
			i++;
		}
		goto path_abempty;
	    } else
		continue;

	  path_abempty:
	    if (is(str, i, '/')) {
		i++;
		while (is(str, i, '/') || is_pchar(str, i))
		    i++;
	    } /* else goto path_empty; */

	  path_empty:
	    ;

	    /* query */
	    if (is(str, i, '?')) {
		i++;
		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
		    i++;
	    } else
		has_no_query = 1;

	    /* fragment */
	    if (is(str, i, '#')) {
		i++;
		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
		    i++;
	    } else
		has_no_fragment = 1;

	    if (has_empty_path && has_no_query && has_no_fragment)
		continue;

	    break;
	}

	if (ptr != NULL)
	    str->len = i - (ptr - str->str);
	str->str = ptr;
	return NULL;
    }

    /* Pass II */

    if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
	lbobj->errnum = errno ? errno : ENOMEM;
	return NULL;
    }

    /* non-break URI. */
    if (data == NULL) {
	for (i = 1; i < gcstr->gclen; i++)
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	return gcstr;
    }

    /* break URI. */
    if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
	gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	i = 5;
    } else
	i = 1;
    for (; i < gcstr->gclen; i++) {
	unichar_t u, v;
	u = gcstr->str[gcstr->gcstr[i - 1].idx];
	v = gcstr->str[gcstr->gcstr[i].idx];

	/*
	 * Some rules based on CMoS 15th ed.
	 * 17.11 1.1: [/] ÷ [^/]
	 * 17.11 2:   [-] ×
	 * 6.17 2:   [.] ×
	 * 17.11 1.2: ÷ [-~.,_?#%]
	 * 17.11 1.3: ÷ [=&]
	 * 17.11 1.3: [=&] ÷
	 * Default:  ALL × ALL
	 */
	if (u == '/' && v != '/')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	else if (u == '-' || u == '.')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	else if (v == '-' || v == '~' || v == '.' || v == ',' ||
		 v == '_' || v == '?' || v == '#' || v == '%' ||
		 u == '=' || v == '=' || u == '&' || v == '&')
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
	else
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
    }

    /* Won't break punctuations at end of matches. */
    for (i = gcstr->gclen - 1; 1 <= i; i--) {
	unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
	if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
	    (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
	     u == '>'))
	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
	else
	    break;
    }
    return gcstr;
}