/**
 * tracker_parser_unaccent_nfkd_string:
 * @str: UTF-8 buffer to modify in place. The word MUST already be
 *       normalized in NFKD form, otherwise accents are not separate
 *       code points and cannot be stripped.
 * @str_length: on input, the byte length of @str; on output, the byte
 *              length of the unaccented result.
 *
 * Strips every combining diacritical mark (as decided by IS_CDM_UCS4)
 * from @str, compacting the remaining characters towards the start of
 * the same buffer.
 *
 * The buffer must be NUL-terminated, because characters are decoded with
 * u8_strmbtouc(), and must have room for a terminator at index
 * *@str_length, because the result is always re-terminated.
 *
 * Processing stops at the first invalid UTF-8 sequence, at an embedded
 * NUL, or at a sequence that would extend past *@str_length; whatever was
 * accepted up to that point is kept.
 *
 * Returns: TRUE once the buffer has been processed, FALSE if one of the
 *          argument precondition checks fails.
 */
gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
	gchar *word;
	gsize word_length;
	gsize i = 0; /* read offset into the original bytes */
	gsize j = 0; /* write offset of the compacted output */

	g_return_val_if_fail (str != NULL, FALSE);
	g_return_val_if_fail (str_length != NULL, FALSE);
	g_return_val_if_fail (*str_length > 0, FALSE);

	word = (gchar *) str;
	word_length = *str_length;

	while (i < word_length) {
		ucs4_t unichar;
		gint utf8_len;

		/* Get next character of the word as UCS4 */
		utf8_len = u8_strmbtouc (&unichar, (const uint8_t *) &word[i]);

		/* Invalid UTF-8 character or end of original string. */
		if (utf8_len <= 0) {
			break;
		}

		/* The decoder only honours the NUL terminator, not the
		 * caller-provided length. Refuse a sequence that runs past
		 * that length so neither the copy below nor the final
		 * terminator touches bytes beyond it. */
		if ((gsize) utf8_len > word_length - i) {
			break;
		}

		/* If the given unichar is a combining diacritical mark,
		 * just update the original index, not the output one */
		if (IS_CDM_UCS4 ((guint32) unichar)) {
			i += utf8_len;
			continue;
		}

		/* Once a mark has been dropped the indexes differ, so the
		 * character has to be moved down. Source and destination
		 * may overlap, hence memmove instead of memcpy. */
		if (i != j) {
			memmove (&word[j], &word[i], utf8_len);
		}

		/* Update both indexes */
		i += utf8_len;
		j += utf8_len;
	}

	/* Force proper string end */
	word[j] = '\0';

	/* Set new output length */
	*str_length = j;

	return TRUE;
}
/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under the GPL * And then from gnome-shell/src/shell-util.c * * Originally written by Aleksander Morgado <*****@*****.**> */ char * cc_util_normalize_casefold_and_unaccent (const char *str) { char *normalized, *tmp; int i = 0, j = 0, ilen; if (str == NULL) return NULL; normalized = g_utf8_normalize (str, -1, G_NORMALIZE_NFKD); tmp = g_utf8_casefold (normalized, -1); g_free (normalized); ilen = strlen (tmp); while (i < ilen) { gunichar unichar; gchar *next_utf8; gint utf8_len; /* Get next character of the word as UCS4 */ unichar = g_utf8_get_char_validated (&tmp[i], -1); /* Invalid UTF-8 character or end of original string. */ if (unichar == (gunichar) -1 || unichar == (gunichar) -2) { break; } /* Find next UTF-8 character */ next_utf8 = g_utf8_next_char (&tmp[i]); utf8_len = next_utf8 - &tmp[i]; if (IS_CDM_UCS4 ((guint32) unichar)) { /* If the given unichar is a combining diacritical mark, * just update the original index, not the output one */ i += utf8_len; continue; } /* If already found a previous combining * diacritical mark, indexes are different so * need to copy characters. As output and input * buffers may overlap, need to use memmove * instead of memcpy */ if (i != j) { memmove (&tmp[j], &tmp[i], utf8_len); } /* Update both indexes */ i += utf8_len; j += utf8_len; } /* Force proper string end */ tmp[j] = '\0'; return tmp; }