Beispiel #1
0
/*
 * Copy the multibyte string name into dst unchanged, truncating it to fit
 * dst_size bytes (dst_size must be non-zero).  dst is always NUL-terminated.
 * Used as the fallback when a name cannot be converted for shortening.
 */
static void shorten_name_copy_raw(char *dst, size_t dst_size, const char *name)
{
    size_t n = strlen(name);

    if (n > dst_size - 1)
        n = dst_size - 1;
    memcpy(dst, name, n);
    dst[n] = '\0';
}

/*
 * Convert the wide string src into a multibyte string in dst using the
 * file's locale `l`.  At most dst_size - 1 bytes are produced and dst is
 * always NUL-terminated (dst_size must be non-zero).  Returns 0 on success
 * (including truncation) and -1 if wcsrtombs_l() reported a conversion
 * error, in which case dst is set to the empty string.
 */
static int shorten_name_wcs_to_mbs(char *dst, size_t dst_size,
        const wchar_t *src)
{
    /*
     * Ask for one byte less than the buffer holds: wcsrtombs_l() does not
     * write a terminator when it stops because the byte limit was reached.
     */
    size_t n = wcsrtombs_l(dst, &src, dst_size - 1, NULL, l);

    if (n == (size_t) -1) {
        dst[0] = '\0';
        return -1;
    }

    dst[n] = '\0';
    return 0;
}

/*
 * Produce two shortened variants of name.
 *
 * name is a multibyte string interpreted in the file's locale `l`.  It is
 * converted to wide characters and scanned word by word (a word being a run
 * of iswalnum_l() characters); characters between words are copied to both
 * outputs unchanged.  At each word start:
 *
 *   - if a phrase from the abbrevs table (pairs of phrase, replacement)
 *     matches case-insensitively and ends on a word boundary, short_name
 *     receives the replacement (first letter upper-cased if the original
 *     started with an upper-case letter).  shortest_name receives it only
 *     when the phrase ends the name and no unabbreviated word has been
 *     copied so far, so that shortest_name does not end up empty;
 *   - otherwise, if a given_names entry matches and is not the last thing
 *     in the name, short_name receives its first character followed by '.'
 *     and shortest_name receives nothing;
 *   - otherwise the word is copied to both outputs as-is.
 *
 * short_name and shortest_name must each have room for 512 bytes and are
 * always NUL-terminated on return, except when name is NULL, in which case
 * they are left untouched.  Input longer than 511 wide characters is
 * truncated.  If name cannot be converted to wide characters, or a result
 * cannot be converted back, the affected output receives a (possibly
 * truncated) copy of name instead.
 *
 * NOTE(review): writes to the wide output buffers are not bounds-checked;
 * this relies on every replacement being no longer than the text it
 * replaces (e.g. no one-letter given names, no abbreviation longer than its
 * phrase).  Confirm against the abbrevs/given_names tables.
 */
void shorten_name(const char *name,
		char short_name[512], char shortest_name[512])
{
    enum { NAME_BUF_SIZE = 512 };

    wchar_t w_name[NAME_BUF_SIZE];
    wchar_t w_short_name[NAME_BUF_SIZE];
    wchar_t w_shortest_name[NAME_BUF_SIZE];

    const char *src;
    const wchar_t *cur_word;
    wchar_t *cur_short_word, *cur_shortest_word;

    int unabbrev = 0;
    int capital;
    size_t i, len, new_len, converted;

    if (!name)
        return;

    /*
     * Convert a copy of the pointer (mbsrtowcs_l() updates it) and leave
     * room for the terminator, which is not written when the limit is hit.
     */
    src = name;
    converted = mbsrtowcs_l(w_name, &src, NAME_BUF_SIZE - 1, NULL, l);
    if (converted == (size_t) -1) {
        /* Invalid multibyte input: hand the name back unshortened */
        shorten_name_copy_raw(short_name, NAME_BUF_SIZE, name);
        shorten_name_copy_raw(shortest_name, NAME_BUF_SIZE, name);
        return;
    }
    w_name[converted] = 0;

    /* TODO: also skip anything in parenthesis from the short names */

    /* TODO: instead of calling wcscasecmp_l all the time it'd be more
     * effective to lower case w_name once and use plain wcscmp or mem
     * compare since the phrases we search for are all lower case already.
     * Might also want to take "collation" into account (wcsxfrm_l the
     * string once and use memcmp instead of wcscoll_l).
     */

    cur_word = w_name;
    cur_short_word = w_short_name;
    cur_shortest_word = w_shortest_name;
    while (1) {
        /* Copy separators (anything that is not alphanumeric) verbatim */
        while (*cur_word && !iswalnum_l(*cur_word, l))
            *cur_short_word ++ = *cur_shortest_word ++ = *cur_word ++;

        if (!*cur_word)
            break;

        /* TODO: use a hash of some kind instead of iterating over arrays */

        /* Go through possible abbreviations from top to bottom */
        for (i = 0; i < ARRAY_SIZE(abbrevs); i += 2) {
            len = wcslen(abbrevs[i]);
            if (wcsncasecmp_l(abbrevs[i], cur_word, len, l))
                continue;

            /* Check that we matched a full word */
            if (iswalnum_l(cur_word[len], l))
                continue;

            capital = iswupper_l(*cur_word, l);
            cur_word += len;

            new_len = wcslen(abbrevs[i + 1]);
            memcpy(cur_short_word, abbrevs[i + 1],
                    new_len * sizeof(wchar_t));
            /*
             * If original was capitalised then capitalise the abbreviation
             * as well, if it was lower case.  Skip this for an empty
             * replacement: there is no character to capitalise and the
             * slot has not been written yet.
             */
            if (capital && new_len > 0)
                *cur_short_word = towupper_l(*cur_short_word, l);

            /* Make sure shortest_word doesn't end up being empty */
            if (!*cur_word && !unabbrev) {
                memcpy(cur_shortest_word, cur_short_word,
                        new_len * sizeof(wchar_t));
                cur_shortest_word += new_len;
            }

            cur_short_word += new_len;

            /*
             * Avoid excess whitespace in short and shortest
             * when a word is replaced with "".
             * TODO: this may require more complicated logic to get
             * the corner cases right.
             */
            if (new_len == 0) {
                if (cur_short_word > w_short_name &&
                        iswspace_l(cur_short_word[-1], l))
                    cur_short_word --;
                if (cur_shortest_word > w_shortest_name &&
                        iswspace_l(cur_shortest_word[-1], l))
                    cur_shortest_word --;
            }

            break;
        }
        /* The loop ended early, so an abbreviation was applied */
        if (i < ARRAY_SIZE(abbrevs))
            continue;

        /* Go through possible given names from top to bottom */
        for (i = 0; i < ARRAY_SIZE(given_names); i ++) {
            len = wcslen(given_names[i]);
            if (wcsncasecmp_l(given_names[i], cur_word, len, l))
                continue;

            /* Check that we matched a full word */
            if (iswalnum_l(cur_word[len], l))
                continue;

            /*
             * If this is the final part of the name, and it matches a
             * given name then that's most likely somebody's surname which
             * happens to also be a possible given name.  In that case
             * do not abbreviate or omit it.
             */
            if (!cur_word[len])
                continue;

            cur_word += len;

            /* short gets the initial, taken from the table entry */
            *cur_short_word ++ = given_names[i][0];
            *cur_short_word ++ = L'.';

            /*
             * Avoid excess whitespace in shortest when a word is
             * replaced with "".
             * TODO: this may require more complicated logic to get
             * the corner cases right.
             */
            if (cur_shortest_word > w_shortest_name &&
                    iswspace_l(cur_shortest_word[-1], l))
                cur_shortest_word --;

            break;
        }
        /* The loop ended early, so a given name was replaced */
        if (i < ARRAY_SIZE(given_names))
            continue;

        /* Nothing matched, copy the current word as-is */
        while (iswalnum_l(*cur_word, l))
            *cur_short_word ++ = *cur_shortest_word ++ = *cur_word ++;
        unabbrev += 1;
    }

    *cur_short_word = 0;
    *cur_shortest_word = 0;

    /* Convert back; fall back to the unshortened name on failure */
    if (shorten_name_wcs_to_mbs(short_name, NAME_BUF_SIZE, w_short_name) < 0)
        shorten_name_copy_raw(short_name, NAME_BUF_SIZE, name);
    if (shorten_name_wcs_to_mbs(shortest_name, NAME_BUF_SIZE,
                w_shortest_name) < 0)
        shorten_name_copy_raw(shortest_name, NAME_BUF_SIZE, name);
}
Beispiel #2
0
/*
 * Compare two multibyte strings according to the collation rules of the
 * given locale.
 *
 * To handle multibyte locales properly, both strings are converted to
 * wide characters with mbsrtowcs_l() and compared with wcscoll_l().  If
 * the collation table reports a load error, a conversion buffer cannot
 * be allocated, or either conversion fails, the result of a plain
 * strcmp() on the original strings is returned instead.  Caller should
 * check errno.
 */
int
strcoll_l(const char *s, const char *s2, locale_t locale)
{
	struct xlocale_collate *collate;
	wchar_t *wide1 = NULL, *wide2 = NULL;
	const char *src1, *src2;
	mbstate_t state1, state2;
	size_t cap1, cap2;
	int result = 0;
	int compared = 0;

	memset(&state1, 0, sizeof (state1));
	memset(&state2, 0, sizeof (state2));

	/*
	 * mbsrtowcs_l() updates the source pointer it is handed (it can
	 * end up NULL), so feed it copies: s and s2 must stay intact for
	 * the strcmp() fallback below.
	 */
	src1 = s;
	src2 = s2;

	FIX_LOCALE(locale);
	collate = (struct xlocale_collate *)locale->components[XLC_COLLATE];

	if (!collate->__collate_load_error) {
		/*
		 * Buffers are sized on the assumption that conversion never
		 * expands: one byte (or multibyte character) yields at most
		 * one wide character, plus room for the terminator.
		 */
		cap1 = strlen(s) + 1;
		cap2 = strlen(s2) + 1;

		wide1 = malloc(cap1 * sizeof (wchar_t));
		if (wide1 != NULL)
			wide2 = malloc(cap2 * sizeof (wchar_t));

		if (wide1 != NULL && wide2 != NULL &&
		    mbsrtowcs_l(wide1, &src1, cap1, &state1, locale) !=
		    (size_t)-1 &&
		    mbsrtowcs_l(wide2, &src2, cap2, &state2, locale) !=
		    (size_t)-1) {
			result = wcscoll_l(wide1, wide2, locale);
			compared = 1;
		}
	}

	/* free(NULL) is a no-op, so this is safe on every path. */
	free(wide1);
	free(wide2);

	/* Anything that prevented the wide comparison degrades to strcmp. */
	return (compared ? result : strcmp(s, s2));
}