/* {{{1
 * Build the canonical decomposition of ‘c’ and return it as an array of
 * code points obtained from ALLOC()/ALLOC_N(). The number of elements in
 * the array is stored in ‘*len’.
 *
 * Three cases are distinguished:
 *   - ‘c’ lies in 0xac00..0xd7af (Hangul syllables): decompose_hangul() is
 *     called twice, first with a NULL output to learn the length, then with
 *     the allocated buffer.
 *   - find_decomposition() has an entry for ‘c’: every character of that
 *     entry is copied into the result.
 *   - otherwise ‘c’ is returned unchanged as a one-element array.
 *
 * NOTE(review): the result is not explicitly reordered. The original author
 * relied, untested, on the Unicode 2.1.9 table producing decompositions that
 * are already in canonical order.
 */
unichar *
unicode_canonical_decomposition(unichar c, size_t *len)
{
    unichar *result;

    /* Hangul syllable */
    if (c >= 0xac00 && c <= 0xd7af) {
        decompose_hangul(c, NULL, len);
        result = ALLOC_N(unichar, *len);
        decompose_hangul(c, result, len);
        return result;
    }

    const char *entry = find_decomposition(c, false);
    if (entry == NULL) {
        /* No decomposition known: the character stands for itself. */
        result = ALLOC(unichar);
        *result = c;
        *len = 1;
        return result;
    }

    *len = utf_length(entry);
    result = ALLOC_N(unichar, *len);

    int idx = 0;
    for (const char *q = entry; *q != NUL; q = utf_next(q)) {
        result[idx] = utf_char(q);
        idx++;
    }

    return result;
}
/*
 * Re-encode the string ‘src’ from encoding ‘src_enc’ into encoding
 * ‘dest_enc’ and return the result as a new string created on heap ‘h’.
 *
 * The destination is created with a capacity of four units per input
 * character (utf_length(src, src_enc) * 4). Each character is read with
 * CHAR_TO_LONG(), written with LONG_TO_UTF(), and the input iterator is
 * advanced by CHAR_LEN(). Afterwards the result's ‘length’ field is set to
 * the amount written and a zero is stored just past the last written unit.
 *
 * NOTE(review): the trailing zero is written at offset ‘length’; this
 * assumes str_create() reserves room for a terminator beyond the requested
 * capacity -- confirm against str_create().
 */
str_t utf_convert(heap_t h, str_t src, int src_enc, int dest_enc)
{
    str_t out = str_create(h, utf_length(src, src_enc) * 4);
    str_it_t rd = str_begin(src);
    str_it_t rd_end = str_end(src);
    str_it_t wr = str_begin(out);

    for (; rd < rd_end; rd += CHAR_LEN(rd, src_enc)) {
        /* Keep the code point in a local so it is evaluated exactly once. */
        long code = CHAR_TO_LONG(rd, src_enc);
        wr = LONG_TO_UTF(code, wr, dest_enc);
    }

    out->length = wr - str_begin(out);
    *wr = 0;

    return out;
}
/*
 * Re-encode the zero-terminated buffer ‘src’ from encoding ‘src_enc’ into
 * encoding ‘dest_enc’, writing into memory obtained from heap ‘h’.
 *
 * Returns 0 when ‘src’ is null or when err() reports a failure after the
 * allocation; otherwise returns the start of the converted buffer.
 *
 * The output buffer is sized at four bytes per input character plus one
 * extra character slot, and the converted text is followed by four zero
 * bytes written with UTF_WRITE4().
 */
void *utf_convert(heap_t h, void *src, int src_enc, int dest_enc)
{
    void *out;
    void *wr;
    int slots;

    if (src == 0)
        return 0;

    err_reset();
    slots = utf_length(src, src_enc) + 1;   /* +1 for the terminator */
    out = heap_alloc(h, slots * 4);
    wr = out;
    if (err())
        return 0;

    for (; !CHAR_IS_LAST(src, src_enc);
         src = C_OFFSET(src, CHAR_LEN(src, src_enc))) {
        /* Keep the code point in a local so it is evaluated exactly once. */
        long code = CHAR_TO_LONG(src, src_enc);
        wr = LONG_TO_UTF(code, wr, dest_enc);
    }

    /* Terminate the output with four zero bytes. */
    wr = UTF_WRITE4(wr, 0, 0, 0, 0);

    return out;
}
/* {{{1 * Normalize (compose/decompose) characters in ‘str˚ so that strings that * actually contain the same characters will be recognized as equal for * comparison for example. */ unichar * _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode) { bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD); bool do_compose = (mode = NORMALIZE_NFC || mode == NORMALIZE_NFKC); size_t n = 0; const char *p = str; while ((!use_len || p < str + max_len) && *p != NUL) { unichar c = utf_char(p); if (c >= 0xac00 && c <= 0xd7af) { size_t len; decompose_hangul(c, NULL, &len); n += len; } else { const char *decomp = find_decomposition(c, do_compat); n += (decomp != NULL) ? utf_length(decomp) : 1; } p = utf_next(p); } unichar *buf = ALLOC_N(unichar, n + 1); size_t prev_start; for (p = str, prev_start = 0, n = 0; (!use_len || p < str + max_len) && *p != NUL; p = utf_next(p)) { unichar c = utf_char(p); size_t prev_n = n; if (c >= 0xac00 && c <= 0xd7af) { size_t len; decompose_hangul(c, buf + n, &len); n += len; } else { const char *decomp = find_decomposition(c, do_compat); if (decomp != NULL) { for ( ; *decomp != NUL; decomp = utf_next(decomp)) buf[n++] = utf_char(decomp); } else { buf[n++] = c; } } if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) { unicode_canonical_ordering(buf + prev_start, n - prev_start); prev_start = prev_n; } } if (n > 0) { unicode_canonical_ordering(buf + prev_start, n - prev_start); prev_start = n; } buf[n] = NUL; /* done with decomposition and reordering */ if (do_compose && n > 0) { prev_start = 0; int prev_cc = 0; for (size_t i = 0; i < n; i++) { int cc = COMBINING_CLASS(buf[i]); if (i > 0 && (prev_cc == 0 || prev_cc != cc) && combine(buf[prev_start], buf[i], &buf[prev_start])) { for (size_t j = i + 1; j < n; j++) buf[j - 1] = buf[j]; n--; i--; prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]); } else { if (cc == 0) prev_start = i; prev_cc = cc; } } buf[n] = NUL; } return buf; }