/* {{{1
 * Rearrange ‘str’ so that decomposed characters are arranged according to
 * their combining class.  Do this for at most ‘len’ elements of ‘str’.
 *
 * The reordering is done in place.  A character with combining class 0 is
 * never moved, and a character is never moved past one whose combining class
 * is less than or equal to its own, so the relative order of characters with
 * equal combining classes is preserved.
 */
void
unicode_canonical_ordering(unichar *str, size_t len)
{
        /* Fewer than two characters are trivially ordered.  Returning early
         * also keeps the unsigned ‘len - 1’ below from wrapping around when
         * ‘len’ is 0. */
        if (len < 2) {
                return;
        }

        bool swapped = true;
        while (swapped) {
                swapped = false;
                int prev = COMBINING_CLASS(str[0]);
                for (size_t i = 0; i < len - 1; i++) {
                        int next = COMBINING_CLASS(str[i + 1]);
                        if (next != 0 && prev > next) {
                                /* Percolate str[i + 1] leftward past every
                                 * character whose combining class is
                                 * strictly greater than its own. */
                                for (size_t j = i + 1;
                                     j > 0 && COMBINING_CLASS(str[j - 1]) > next;
                                     j--) {
                                        unichar c = str[j];
                                        str[j] = str[j - 1];
                                        str[j - 1] = c;
                                        swapped = true;
                                }
                                /* The character that used to be at str[i]
                                 * now sits at str[i + 1], so its class is
                                 * still the one to compare against. */
                                next = prev;
                        }
                        prev = next;
                }
        }
}
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  See the Unicode
 * manual for more information.
 *
 * If @len is less than two there is nothing to reorder and
 * @string is neither read nor modified.
 **/
void
g_unicode_canonical_ordering (gunichar *string,
                              gsize     len)
{
  gsize i;
  int swap = 1;

  /* Nothing to reorder.  This also prevents the unsigned `len - 1'
   * in the loop below from wrapping around when len is 0. */
  if (len < 2)
    return;

  while (swap)
    {
      int last;

      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);

          if (next != 0 && last > next)
            {
              gsize j;

              /* Percolate item leftward through string. */
              for (j = i + 1; j > 0; --j)
                {
                  gunichar t;

                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again. */
              next = last;
            }
          last = next;
        }
    }
}
/*
 * _g_utf8_normalize_wc:
 * @str: UTF-8 text to normalize.
 * @max_len: number of bytes of @str to consider, or a negative value to
 *   read until the terminating NUL.  Reading stops at the first NUL byte
 *   in either case.
 * @mode: the normalization form to produce.
 *
 * Decodes @str into a zero-terminated #gunichar array allocated with
 * g_new().  Every character is decomposed (compatibility decompositions
 * are requested from find_decomposition() for the NFKC/NFKD modes), each
 * run of characters is put into canonical order and, for the NFC/NFKC
 * modes, adjacent characters are recombined with combine().
 *
 * Return value: the newly allocated, zero-terminated buffer.
 */
static gunichar *
_g_utf8_normalize_wc (const gchar    *str,
                      gssize          max_len,
                      GNormalizeMode  mode)
{
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
  gunichar *out;
  const char *cursor;
  gsize count = 0;
  gsize run_start = 0;

  /* Pass 1: work out how many code points the decomposed text needs. */
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);

      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          gsize hangul_len;

          decompose_hangul (ch, NULL, &hangul_len);
          count += hangul_len;
        }
      else
        {
          const gchar *decomp = find_decomposition (ch, do_compat);

          if (decomp == NULL)
            count++;
          else
            count += g_utf8_strlen (decomp, -1);
        }
    }

  /* One extra slot for the terminating zero. */
  out = g_new (gunichar, count + 1);

  /* Pass 2: write the decomposed code points, ordering each finished run. */
  count = 0;
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);
      gsize first = count;   /* index of the first code point written for ch */

      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          gsize hangul_len;

          decompose_hangul (ch, out + count, &hangul_len);
          count += hangul_len;
        }
      else
        {
          const gchar *decomp = find_decomposition (ch, do_compat);

          if (decomp == NULL)
            out[count++] = ch;
          else
            {
              const char *d = decomp;

              while (*d != '\0')
                {
                  out[count++] = g_utf8_get_char (d);
                  d = g_utf8_next_char (d);
                }
            }
        }

      /* A code point of combining class 0 closes the current run: order
       * what has been collected and start the next run at that code point. */
      if (count > 0 && COMBINING_CLASS (out[first]) == 0)
        {
          g_unicode_canonical_ordering (out + run_start, count - run_start);
          run_start = first;
        }
    }

  /* Order whatever is left in the final run. */
  if (count > 0)
    g_unicode_canonical_ordering (out + run_start, count - run_start);

  out[count] = 0;

  /* All decomposed and reordered */

  if (do_compose && count > 0)
    {
      gsize i;
      int prev_cc = 0;

      run_start = 0;
      for (i = 0; i < count; i++)
        {
          int cc = COMBINING_CLASS (out[i]);
          gboolean candidate = (i > 0 && (prev_cc == 0 || prev_cc != cc));

          if (candidate && combine (out[run_start], out[i], &out[run_start]))
            {
              gsize k;

              /* out[i] was absorbed into out[run_start]; close the gap. */
              for (k = i; k + 1 < count; k++)
                out[k] = out[k + 1];
              count--;

              /* Step back so the loop increment lands on the same index
               * again, which now holds the following code point. */
              i--;
              if (i == run_start)
                prev_cc = 0;
              else
                prev_cc = COMBINING_CLASS (out[i - 1]);
            }
          else
            {
              if (cc == 0)
                run_start = i;
              prev_cc = cc;
            }
        }
    }

  out[count] = 0;

  return out;
}
/**
 * g_unichar_combining_class:
 * @uc: a Unicode character
 *
 * Looks up the canonical combining class of @uc.
 *
 * Return value: the combining class of the character
 *
 * Since: 2.14
 **/
gint
g_unichar_combining_class (gunichar uc)
{
  gint combining_class = COMBINING_CLASS (uc);

  return combining_class;
}
/* {{{1
 * Return the combining class of ‘c’, as given by COMBINING_CLASS().
 */
inline int
_unichar_combining_class(unichar c)
{
        int combining_class = COMBINING_CLASS(c);

        return combining_class;
}
/* {{{1
 * Normalize (compose/decompose) characters in ‘str’ so that strings that
 * actually contain the same characters will be recognized as equal for
 * comparison for example.
 *
 * ‘str’ is read up to its first NUL byte, or, if ‘use_len’ is true, for at
 * most ‘max_len’ bytes.  Every character is decomposed (compatibility
 * decompositions are requested for NORMALIZE_NFKC and NORMALIZE_NFKD), each
 * run of characters is put into canonical order, and for NORMALIZE_NFC and
 * NORMALIZE_NFKC adjacent characters are then recombined.
 *
 * Returns a NUL-terminated buffer obtained from ALLOC_N().
 */
unichar *
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
{
        bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
        /* This must compare, not assign: an assignment here would clobber
         * ‘mode’ and enable composition for modes that did not ask for it. */
        bool do_compose = (mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC);

        /* First pass: count how many characters the decomposition needs. */
        size_t n = 0;
        const char *p = str;
        while ((!use_len || p < str + max_len) && *p != NUL) {
                unichar c = utf_char(p);

                if (c >= 0xac00 && c <= 0xd7af) {
                        size_t len;

                        decompose_hangul(c, NULL, &len);
                        n += len;
                } else {
                        const char *decomp = find_decomposition(c, do_compat);

                        n += (decomp != NULL) ? utf_length(decomp) : 1;
                }

                p = utf_next(p);
        }

        /* One extra slot for the terminating NUL. */
        unichar *buf = ALLOC_N(unichar, n + 1);

        /* Second pass: store the decomposed characters, ordering each run
         * as soon as the next character of combining class 0 is seen. */
        size_t prev_start;
        for (p = str, prev_start = 0, n = 0;
             (!use_len || p < str + max_len) && *p != NUL;
             p = utf_next(p)) {
                unichar c = utf_char(p);
                size_t prev_n = n;

                if (c >= 0xac00 && c <= 0xd7af) {
                        size_t len;

                        decompose_hangul(c, buf + n, &len);
                        n += len;
                } else {
                        const char *decomp = find_decomposition(c, do_compat);

                        if (decomp != NULL) {
                                for ( ; *decomp != NUL; decomp = utf_next(decomp)) {
                                        buf[n++] = utf_char(decomp);
                                }
                        } else {
                                buf[n++] = c;
                        }
                }

                if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
                        unicode_canonical_ordering(buf + prev_start, n - prev_start);
                        prev_start = prev_n;
                }
        }

        /* Order whatever is left in the final run. */
        if (n > 0) {
                unicode_canonical_ordering(buf + prev_start, n - prev_start);
                prev_start = n;
        }

        buf[n] = NUL;

        /* done with decomposition and reordering */

        if (do_compose && n > 0) {
                prev_start = 0;
                int prev_cc = 0;
                for (size_t i = 0; i < n; i++) {
                        int cc = COMBINING_CLASS(buf[i]);

                        if (i > 0 &&
                            (prev_cc == 0 || prev_cc != cc) &&
                            combine(buf[prev_start], buf[i], &buf[prev_start])) {
                                /* buf[i] was merged into buf[prev_start];
                                 * close the gap it leaves behind. */
                                for (size_t j = i + 1; j < n; j++) {
                                        buf[j - 1] = buf[j];
                                }
                                n--;
                                /* Step back so the loop increment revisits
                                 * this index, which now holds the next
                                 * character. */
                                i--;
                                prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
                        } else {
                                if (cc == 0) {
                                        prev_start = i;
                                }
                                prev_cc = cc;
                        }
                }

                buf[n] = NUL;
        }

        return buf;
}