/**
 * normalize_terms:
 * @terms: (element-type utf8): Input search terms
 *
 * Passes every term through shell_util_normalize_and_casefold() and collects
 * the results in a newly built list. Each result is prepended, so the
 * returned list holds the terms in the reverse of their input order.
 *
 * Returns: (element-type utf8) (transfer full): Unicode-normalized and lowercased terms
 */
static GSList *
normalize_terms (GSList *terms)
{
  GSList *result = NULL;
  GSList *node = terms;

  while (node != NULL)
    {
      const char *raw = node->data;
      char *folded = shell_util_normalize_and_casefold (raw);

      /* Prepending keeps each insertion O(1); order ends up reversed. */
      result = g_slist_prepend (result, folded);
      node = node->next;
    }

  return result;
}
/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under the GPL * Originally written by Aleksander Morgado <*****@*****.**> */ char * shell_util_normalize_casefold_and_unaccent (const char *str) { char *tmp; gsize i = 0, j = 0, ilen; if (str == NULL) return NULL; /* Get the NFKD-normalized and casefolded string */ tmp = shell_util_normalize_and_casefold (str); ilen = strlen (tmp); while (i < ilen) { gunichar unichar; gchar *next_utf8; gint utf8_len; /* Get next character of the word as UCS4 */ unichar = g_utf8_get_char_validated (&tmp[i], -1); /* Invalid UTF-8 character or end of original string. */ if (unichar == (gunichar) -1 || unichar == (gunichar) -2) { break; } /* Find next UTF-8 character */ next_utf8 = g_utf8_next_char (&tmp[i]); utf8_len = next_utf8 - &tmp[i]; if (IS_CDM_UCS4 ((guint32) unichar)) { /* If the given unichar is a combining diacritical mark, * just update the original index, not the output one */ i += utf8_len; continue; } /* If already found a previous combining * diacritical mark, indexes are different so * need to copy characters. As output and input * buffers may overlap, need to use memmove * instead of memcpy */ if (i != j) { memmove (&tmp[j], &tmp[i], utf8_len); } /* Update both indexes */ i += utf8_len; j += utf8_len; } /* Force proper string end */ tmp[j] = '\0'; return tmp; }