예제 #1
0
파일: nfkc.c 프로젝트: AlekSi/Jabbin
/*
 * Normalize the UTF-8 string STR into a freshly allocated, 0-terminated
 * array of gunichar.
 *
 * str:     UTF-8 input; scanning stops at the first NUL byte.
 * max_len: when negative the whole NUL-terminated string is processed,
 *          otherwise only characters starting before str + max_len.
 * mode:    the NFKC/NFKD modes request compatibility decompositions; the
 *          NFC/NFKC modes additionally run the composition step at the end.
 *
 * Returns a buffer obtained from g_new(); it is handed to the caller
 * (presumably to be released with g_free() -- confirm against callers).
 */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  const gboolean compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  const gboolean compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
  const char *cursor;
  gunichar *out;
  gsize count = 0;
  gsize run_start = 0;

  /* Pass 1: work out how many code points the decomposed text needs. */
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);

      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          /* This range is handled by decompose_hangul rather than by a
             table lookup; a NULL output asks only for the length. */
          gsize hangul_len;

          decompose_hangul (ch, NULL, &hangul_len);
          count += hangul_len;
        }
      else
        {
          const gchar *mapping = find_decomposition (ch, compat);

          if (mapping == NULL)
            count++;
          else
            count += g_utf8_strlen (mapping, -1);
        }
    }

  /* One extra slot for the terminating 0. */
  out = g_new (gunichar, count + 1);

  /* Pass 2: emit the decomposition, putting each finished run into
     canonical order as soon as the next class-0 character shows up. */
  count = 0;
  for (cursor = str;
       (max_len < 0 || cursor < str + max_len) && *cursor;
       cursor = g_utf8_next_char (cursor))
    {
      gunichar ch = g_utf8_get_char (cursor);
      gsize first = count;	/* index of the first code point emitted for CH */
      int first_class;

      if (ch >= 0xac00 && ch <= 0xd7af)
        {
          gsize hangul_len;

          decompose_hangul (ch, out + count, &hangul_len);
          count += hangul_len;
        }
      else
        {
          const gchar *mapping = find_decomposition (ch, compat);

          if (mapping == NULL)
            out[count++] = ch;
          else
            {
              const char *m = mapping;

              while (*m != '\0')
                {
                  out[count++] = g_utf8_get_char (m);
                  m = g_utf8_next_char (m);
                }
            }
        }

      if (count == 0)
        continue;

      first_class = COMBINING_CLASS (out[first]);
      if (first_class == 0)
        {
          /* A combining class of 0 begins a new run: order everything
             collected since the previous run start, then restart here. */
          g_unicode_canonical_ordering (out + run_start, count - run_start);
          run_start = first;
        }
    }

  /* Order whatever is left in the final run. */
  if (count > 0)
    g_unicode_canonical_ordering (out + run_start, count - run_start);

  out[count] = 0;

  /* Everything is decomposed and reordered; compose if requested. */

  if (compose && count > 0)
    {
      gsize idx, shift;
      int prev_class = 0;

      run_start = 0;

      for (idx = 0; idx < count; idx++)
        {
          int cur_class = COMBINING_CLASS (out[idx]);

          if (idx > 0 &&
              (prev_class == 0 || prev_class != cur_class) &&
              combine (out[run_start], out[idx], &out[run_start]))
            {
              /* out[idx] was merged into out[run_start]: close the gap
                 and step back so the loop increment revisits this slot. */
              for (shift = idx + 1; shift < count; shift++)
                out[shift - 1] = out[shift];
              count--;
              idx--;

              if (idx == run_start)
                prev_class = 0;
              else
                prev_class = COMBINING_CLASS (out[idx - 1]);
            }
          else
            {
              if (cur_class == 0)
                run_start = idx;

              prev_class = cur_class;
            }
        }
    }

  out[count] = 0;

  return out;
}
예제 #2
0
파일: decompose.c 프로젝트: now/ned
/* {{{1
 * Normalize (compose/decompose) the characters in 'str' so that strings
 * that actually contain the same characters will be recognized as equal,
 * for example when comparing them.
 *
 * str:     UTF-8 input; scanning stops at the first NUL byte.
 * max_len: byte limit on the input, honoured only when 'use_len' is true.
 * use_len: when false, 'max_len' is ignored and the whole NUL-terminated
 *          string is processed.
 * mode:    NORMALIZE_NFKC/NORMALIZE_NFKD select compatibility
 *          decompositions; NORMALIZE_NFC/NORMALIZE_NFKC additionally run
 *          the composition step.
 *
 * Returns a NUL-terminated array of unichar obtained from ALLOC_N; the
 * buffer is handed to the caller.
 */
unichar *
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
{
	bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
	/* Must be a comparison: an assignment here would overwrite 'mode' and
	 * make composition depend on the enum's numeric value. */
	bool do_compose = (mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC);

	/* Pass 1: count how many code points the decomposed text needs. */
	size_t n = 0;
	const char *p = str;
	while ((!use_len || p < str + max_len) && *p != NUL) {
		unichar c = utf_char(p);

		if (c >= 0xac00 && c <= 0xd7af) {
			size_t len;

			/* NULL output: only the resulting length is wanted. */
			decompose_hangul(c, NULL, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			n += (decomp != NULL) ? utf_length(decomp) : 1;
		}

		p = utf_next(p);
	}

	/* Pass 2: write the decomposition (one extra slot for the terminator),
	 * ordering each run as soon as the next class-0 character appears. */
	unichar *buf = ALLOC_N(unichar, n + 1);
	size_t prev_start;
	for (p = str, prev_start = 0, n = 0; (!use_len || p < str + max_len) && *p != NUL; p = utf_next(p)) {
		unichar c = utf_char(p);
		size_t prev_n = n;

		if (c >= 0xac00 && c <= 0xd7af) {
			size_t len;

			decompose_hangul(c, buf + n, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			if (decomp != NULL) {
				for ( ; *decomp != NUL; decomp = utf_next(decomp))
					buf[n++] = utf_char(decomp);
			} else {
				buf[n++] = c;
			}
		}

		if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
			unicode_canonical_ordering(buf + prev_start, n - prev_start);
			prev_start = prev_n;
		}
	}

	/* Order whatever is left in the final run. */
	if (n > 0)
		unicode_canonical_ordering(buf + prev_start, n - prev_start);

	buf[n] = NUL;

	/* done with decomposition and reordering */

	if (do_compose && n > 0) {
		prev_start = 0;
		int prev_cc = 0;
		for (size_t i = 0; i < n; i++) {
			int cc = COMBINING_CLASS(buf[i]);

			if (i > 0 && (prev_cc == 0 || prev_cc != cc) && combine(buf[prev_start], buf[i], &buf[prev_start])) {
				/* buf[i] was merged into buf[prev_start]: close the
				 * gap and step back so the loop revisits this slot. */
				for (size_t j = i + 1; j < n; j++)
					buf[j - 1] = buf[j];

				n--;
				i--;
				prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
			} else {
				if (cc == 0)
					prev_start = i;

				prev_cc = cc;
			}
		}

		buf[n] = NUL;
	}

	return buf;
}