예제 #1
0
파일: decompose.c 프로젝트: now/ned
/* {{{1
 * Build the canonical decomposition of ‘c’ in a freshly allocated buffer
 * and return it.  The number of characters placed in the buffer is stored
 * in ‘*len’.  A character that has no decomposition yields a one-element
 * buffer containing ‘c’ itself.
 */
unichar *
unicode_canonical_decomposition(unichar c, size_t *len)
{
	unichar *result;

	/* Hangul syllables go through decompose_hangul(): the first call is
	 * made without an output buffer and is used to obtain the length, the
	 * second call fills the buffer allocated from that length. */
	if (c >= 0xac00 && c <= 0xd7af) {
		decompose_hangul(c, NULL, len);
		result = ALLOC_N(unichar, *len);
		decompose_hangul(c, result, len);
		return result;
	}

	const char *entry = find_decomposition(c, false);

	/* No table entry: the character decomposes to itself. */
	if (entry == NULL) {
		result = ALLOC(unichar);
		*result = c;
		*len = 1;
		return result;
	}

	/* Copy the table entry character by character.  The table is assumed
	 * (per the Unicode 2.1.9 data it is said to follow) to list the
	 * decomposition in canonical order already; this is relied upon but
	 * has not been verified. */
	*len = utf_length(entry);
	result = ALLOC_N(unichar, *len);

	int n = 0;
	const char *cursor = entry;
	while (*cursor != NUL) {
		result[n] = utf_char(cursor);
		cursor = utf_next(cursor);
		n++;
	}

	return result;
}
예제 #2
0
파일: utf.c 프로젝트: monkin/may_core
/*
 * Re-encode the string `src` from encoding `src_enc` into encoding
 * `dest_enc` and return the result as a new string created on heap `h`.
 *
 * The result is created with a size of four times the value reported by
 * utf_length(src, src_enc).  After conversion its `length` field is set to
 * the distance actually written and a zero is stored right behind the
 * converted data.
 */
str_t utf_convert(heap_t h, str_t src, int src_enc, int dest_enc) {
	str_t out = str_create(h, utf_length(src, src_enc)*4);
	str_it_t in_pos = str_begin(src);
	str_it_t in_end = str_end(src);
	str_it_t out_pos = str_begin(out);

	/* Decode one character, step past it in the input, then encode it. */
	for(; in_pos<in_end; ) {
		long code = CHAR_TO_LONG(in_pos, src_enc);
		in_pos += CHAR_LEN(in_pos, src_enc);
		out_pos = LONG_TO_UTF(code, out_pos, dest_enc);
	}

	out->length = out_pos - str_begin(out);
	*out_pos = 0;
	return out;
}
예제 #3
0
파일: utf.c 프로젝트: monkin/xcss
/*
 * Re-encode the text at `src` from encoding `src_enc` into encoding
 * `dest_enc`, writing into a buffer obtained from heap `h`.
 *
 * Returns the start of the new buffer, or 0 when `src` is null or when
 * err() reports a failure after the allocation.  The buffer is sized as
 * (utf_length(src, src_enc) + 1) * 4 and the converted text is followed by
 * a terminator written with UTF_WRITE4(…, 0, 0, 0, 0).
 */
void *utf_convert(heap_t h, void *src, int src_enc, int dest_enc) {
	void *result;
	void *out;
	int count;

	if(src == 0)
		return 0;

	/* Clear any earlier error state so err() reflects this allocation. */
	err_reset();
	count = utf_length(src, src_enc) + 1;
	result = heap_alloc(h, count*4);
	out = result;
	if(err())
		return 0;

	/* Decode one character, step past it in the input, then encode it. */
	for(; !CHAR_IS_LAST(src, src_enc); ) {
		long code = CHAR_TO_LONG(src, src_enc);
		src = C_OFFSET(src, CHAR_LEN(src, src_enc));
		out = LONG_TO_UTF(code, out, dest_enc);
	}

	out = UTF_WRITE4(out, 0, 0, 0, 0);
	return result;
}
예제 #4
0
파일: decompose.c 프로젝트: now/ned
/* {{{1
 * Normalize (compose/decompose) the characters in ‘str’ so that strings
 * that actually contain the same characters will be recognized as equal,
 * for example when they are compared.
 *
 * ‘str’ is read up to its NUL terminator; when ‘use_len’ is true, reading
 * also stops once ‘str + max_len’ is reached.  ‘mode’ selects whether
 * compatibility decompositions are applied (NFKC, NFKD) and whether the
 * decomposed result is composed again afterwards (NFC, NFKC).
 *
 * Returns a buffer obtained from ALLOC_N() that holds the normalized
 * characters followed by a terminating NUL.
 */
unichar *
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
{
	bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
	/* Both sides must be comparisons.  Writing ‘mode = …’ here would
	 * overwrite ‘mode’ and turn composition on whenever NORMALIZE_NFC is
	 * non-zero, i.e. also for the decompose-only modes. */
	bool do_compose = (mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC);

	/* Pass 1: count how many characters the decomposed string needs. */
	size_t n = 0;
	const char *p = str;
	while ((!use_len || p < str + max_len) && *p != NUL) {
		unichar c = utf_char(p);

		if (c >= 0xac00 && c <= 0xd7af) {
			/* Hangul syllable: ask decompose_hangul() for the length only. */
			size_t len;

			decompose_hangul(c, NULL, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			n += (decomp != NULL) ? utf_length(decomp) : 1;
		}

		p = utf_next(p);
	}

	/* Pass 2: decompose into the buffer (one extra slot for the NUL) and
	 * put every run of combining characters into canonical order. */
	unichar *buf = ALLOC_N(unichar, n + 1);
	size_t prev_start;
	for (p = str, prev_start = 0, n = 0; (!use_len || p < str + max_len) && *p != NUL; p = utf_next(p)) {
		unichar c = utf_char(p);
		size_t prev_n = n;

		if (c >= 0xac00 && c <= 0xd7af) {
			size_t len;

			decompose_hangul(c, buf + n, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			if (decomp != NULL) {
				for ( ; *decomp != NUL; decomp = utf_next(decomp))
					buf[n++] = utf_char(decomp);
			} else {
				buf[n++] = c;
			}
		}

		/* The first character just written has combining class 0, so the
		 * run that started at ‘prev_start’ is complete: order it and
		 * begin a new run at this character. */
		if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
			unicode_canonical_ordering(buf + prev_start, n - prev_start);
			prev_start = prev_n;
		}
	}

	/* Order the final run, which the loop above never closed. */
	if (n > 0)
		unicode_canonical_ordering(buf + prev_start, n - prev_start);

	buf[n] = NUL;

	/* done with decomposition and reordering */

	if (do_compose && n > 0) {
		/* ‘prev_start’ now tracks the most recent character of combining
		 * class 0, the one later characters are combined into. */
		prev_start = 0;
		int prev_cc = 0;
		for (size_t i = 0; i < n; i++) {
			int cc = COMBINING_CLASS(buf[i]);

			/* A character may only combine with the starter when nothing
			 * in between blocks it: either it directly follows a class-0
			 * character, or the preceding character has a strictly lower
			 * combining class.  Using ‘!=’ instead of ‘<’ would also let
			 * a class-0 character combine across an intervening mark. */
			if (i > 0 && (prev_cc == 0 || prev_cc < cc) && combine(buf[prev_start], buf[i], &buf[prev_start])) {
				/* buf[i] was merged into the starter: close the gap. */
				for (size_t j = i + 1; j < n; j++)
					buf[j - 1] = buf[j];

				n--;
				i--;
				prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
			} else {
				if (cc == 0)
					prev_start = i;

				prev_cc = cc;
			}
		}

		buf[n] = NUL;
	}

	return buf;
}