/*
 * Convert the UTF-8 string `str` into a newly allocated, zero-terminated
 * array of code points normalized according to `mode`.
 *
 * str:     UTF-8 input; reading stops at the first NUL byte.
 * max_len: maximum number of bytes of `str` to examine, or a negative
 *          value to read up to the NUL terminator.
 * mode:    NFD/NFC/NFKD/NFKC selector.  The "K" forms request compatibility
 *          decompositions; the "C" forms add a composition pass at the end.
 *
 * The result is allocated with g_new() and holds one element more than the
 * number of code points so that it can be zero-terminated.
 */
static gunichar *
_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
{
  const gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  const gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
  const char *cursor;
  gunichar *out;
  gsize total = 0;
  gsize filled = 0;
  gsize segment_start = 0;

  /* Pass 1: work out how many code points the fully decomposed text needs. */
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);
      const gchar *mapping;

      /* Precomposed Hangul syllables are decomposed by dedicated code. */
      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          gsize hangul_len;

          decompose_hangul (ch, NULL, &hangul_len);
          total += hangul_len;
          continue;
        }

      mapping = find_decomposition (ch, do_compat);
      if (mapping)
        total += g_utf8_strlen (mapping, -1);
      else
        total++;
    }

  out = g_new (gunichar, total + 1);

  /* Pass 2: emit the decomposition, canonically ordering each finished run. */
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);
      gsize first = filled;   /* index of the first code point emitted for ch */

      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          gsize hangul_len;

          decompose_hangul (ch, out + filled, &hangul_len);
          filled += hangul_len;
        }
      else
        {
          const gchar *mapping = find_decomposition (ch, do_compat);

          if (mapping == NULL)
            {
              out[filled++] = ch;
            }
          else
            {
              while (*mapping != '\0')
                {
                  out[filled++] = g_utf8_get_char (mapping);
                  mapping = g_utf8_next_char (mapping);
                }
            }
        }

      /*
       * A code point with combining class 0 begins a new run: sort what has
       * accumulated since the previous run start (the call also covers the
       * code points just emitted) and remember where this run begins.
       */
      if (filled > 0 && COMBINING_CLASS (out[first]) == 0)
        {
          g_unicode_canonical_ordering (out + segment_start,
                                        filled - segment_start);
          segment_start = first;
        }
    }

  /* Order whatever trails the last run start. */
  if (filled > 0)
    g_unicode_canonical_ordering (out + segment_start, filled - segment_start);

  out[filled] = 0;

  /* Everything is decomposed and reordered; recompose in place if asked to. */
  if (do_compose && filled > 0)
    {
      gsize starter = 0;   /* index of the code point we try to combine into */
      gsize idx = 0;
      int prev_cc = 0;

      while (idx < filled)
        {
          int cc = COMBINING_CLASS (out[idx]);
          gboolean unblocked = (prev_cc == 0 || prev_cc != cc);

          if (idx > 0 && unblocked &&
              combine (out[starter], out[idx], &out[starter]))
            {
              gsize prev = idx - 1;
              gsize k;

              /* out[idx] was absorbed into the starter: close the gap. */
              for (k = idx; k + 1 < filled; k++)
                out[k] = out[k + 1];
              filled--;

              if (prev == starter)
                prev_cc = 0;
              else
                prev_cc = COMBINING_CLASS (out[prev - 1]);

              /* idx is left alone: it now names the next unexamined entry. */
              continue;
            }

          if (cc == 0)
            starter = idx;
          prev_cc = cc;
          idx++;
        }
    }

  out[filled] = 0;

  return out;
}
/* {{{1
 * Normalize (compose/decompose) the characters in `str` so that strings
 * that actually contain the same characters will be recognized as equal,
 * for example when they are compared.
 *
 * str      UTF-8 input; scanning always stops at the first NUL byte.
 * max_len  Upper bound, in bytes, on how much of `str` is examined.  Only
 *          honoured when `use_len` is true.
 * use_len  Whether `max_len` applies; when false the whole NUL-terminated
 *          string is processed.
 * mode     Normalization form.  NFKC/NFKD request compatibility
 *          decompositions; NFC/NFKC additionally run the composition pass.
 *
 * Returns a NUL-terminated array of code points obtained from ALLOC_N.
 * NOTE(review): the caller presumably owns and must release the buffer —
 * confirm against ALLOC_N's contract.
 */
unichar *
_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
                  NormalizeMode mode)
{
    bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
    /* `==` on both sides: `mode` must be compared here, never assigned, or
     * the compose decision no longer depends on the requested form. */
    bool do_compose = (mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC);

    /* Pass 1: count the code points of the fully decomposed text. */
    size_t n = 0;
    const char *p = str;
    while ((!use_len || p < str + max_len) && *p != NUL) {
        unichar c = utf_char(p);

        /* Precomposed Hangul syllables are decomposed by dedicated code. */
        if (c >= 0xac00 && c <= 0xd7af) {
            size_t len;
            decompose_hangul(c, NULL, &len);
            n += len;
        } else {
            const char *decomp = find_decomposition(c, do_compat);
            n += (decomp != NULL) ? utf_length(decomp) : 1;
        }

        p = utf_next(p);
    }

    /* One extra slot for the terminator. */
    unichar *buf = ALLOC_N(unichar, n + 1);

    /* Pass 2: emit the decomposition and canonically order each run. */
    size_t prev_start;
    for (p = str, prev_start = 0, n = 0;
         (!use_len || p < str + max_len) && *p != NUL;
         p = utf_next(p)) {
        unichar c = utf_char(p);
        size_t prev_n = n;  /* index of the first code point emitted for c */

        if (c >= 0xac00 && c <= 0xd7af) {
            size_t len;
            decompose_hangul(c, buf + n, &len);
            n += len;
        } else {
            const char *decomp = find_decomposition(c, do_compat);
            if (decomp != NULL) {
                for ( ; *decomp != NUL; decomp = utf_next(decomp)) {
                    buf[n++] = utf_char(decomp);
                }
            } else {
                buf[n++] = c;
            }
        }

        /* A code point with combining class 0 starts a new run: order what
         * has accumulated since the previous run start, then remember where
         * this run begins. */
        if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
            unicode_canonical_ordering(buf + prev_start, n - prev_start);
            prev_start = prev_n;
        }
    }

    /* Order whatever trails the last run start. */
    if (n > 0) {
        unicode_canonical_ordering(buf + prev_start, n - prev_start);
    }

    buf[n] = NUL;

    /* Done with decomposition and reordering; recompose in place if the
     * requested form is a composed one. */
    if (do_compose && n > 0) {
        prev_start = 0;  /* now: index of the code point we combine into */
        int prev_cc = 0;

        for (size_t i = 0; i < n; i++) {
            int cc = COMBINING_CLASS(buf[i]);

            if (i > 0
                && (prev_cc == 0 || prev_cc != cc)
                && combine(buf[prev_start], buf[i], &buf[prev_start])) {
                /* buf[i] was absorbed into buf[prev_start]: close the gap. */
                for (size_t j = i + 1; j < n; j++) {
                    buf[j - 1] = buf[j];
                }
                n--;
                /* Step back so the loop increment revisits slot i, which now
                 * holds the next unexamined code point. */
                i--;
                prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
            } else {
                if (cc == 0) {
                    prev_start = i;
                }
                prev_cc = cc;
            }
        }

        buf[n] = NUL;
    }

    return buf;
}