コード例 #1
0
ファイル: decompose.c プロジェクト: now/ned
/* {{{1
 * Rearrange ‘str’ so that decomposed characters are arranged according to
 * their combining class.  Do this for the first ‘len’ characters (array
 * elements, not bytes) of ‘str’.
 *
 * The reordering is done in place.  Characters with combining class 0 are
 * never moved, and characters with equal combining classes keep their
 * relative order.
 */
void
unicode_canonical_ordering(unichar *str, size_t len)
{
	/* With fewer than two characters there is nothing to reorder.  This
	 * also keeps ‘len - 1’ below from wrapping around when ‘len’ is 0. */
	if (len < 2)
		return;

	bool swapped = true;

	/* Keep making passes until one completes without moving anything. */
	while (swapped) {
		swapped = false;

		/* Invariant: at the top of each iteration ‘prev’ is the
		 * combining class of str[i]. */
		int prev = COMBINING_CLASS(str[0]);

		for (size_t i = 0; i < len - 1; i++) {
			int next = COMBINING_CLASS(str[i + 1]);

			if (next != 0 && prev > next) {
				/* Percolate str[i + 1] leftward past every
				 * character whose class is strictly greater.
				 * Stopping at the first class that is not
				 * greater keeps equal classes in order and
				 * never crosses a class-0 character, since
				 * ‘next’ is non-zero here. */
				for (size_t j = i + 1; j > 0 && COMBINING_CLASS(str[j - 1]) > next; j--) {
					unichar c = str[j];
					str[j] = str[j - 1];
					str[j - 1] = c;
					swapped = true;
				}

				/* str[i + 1] now holds the character that
				 * used to be at str[i], so its class is the
				 * old ‘prev’. */
				next = prev;
			}

			prev = next;
		}
	}
}
コード例 #2
0
ファイル: gunidecomp.c プロジェクト: 01org/android-bluez-glib
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use, in characters.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  Characters with
 * combining class 0 are never moved, and characters with
 * equal combining classes keep their relative order.
 * Strings of fewer than two characters are left untouched.
 * See the Unicode manual for more information.
 **/
void
g_unicode_canonical_ordering (gunichar *string,
			      gsize     len)
{
  gsize i;
  int swap = 1;

  /* Nothing to reorder.  This also keeps the unsigned `len - 1' below
     from wrapping around (and string[0] from being read) when LEN is 0.  */
  if (len < 2)
    return;

  /* Repeat passes until one completes without moving anything.  */
  while (swap)
    {
      int last;
      swap = 0;
      /* Invariant: LAST is the combining class of string[i].  */
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
	{
	  int next = COMBINING_CLASS (string[i + 1]);
	  if (next != 0 && last > next)
	    {
	      gsize j;
	      /* Percolate item leftward through string, stopping at the
		 first character whose class is not greater than NEXT.  */
	      for (j = i + 1; j > 0; --j)
		{
		  gunichar t;
		  if (COMBINING_CLASS (string[j - 1]) <= next)
		    break;
		  t = string[j];
		  string[j] = string[j - 1];
		  string[j - 1] = t;
		  swap = 1;
		}
	      /* We're re-entering the loop looking at the old
		 character again.  */
	      next = last;
	    }
	  last = next;
	}
    }
}
コード例 #3
0
ファイル: nfkc.c プロジェクト: AlekSi/Jabbin
/*
 * Decode the UTF-8 string STR into a newly allocated, 0-terminated array
 * of gunichar, decomposing every character and putting the result into
 * canonical order; for the composing modes (NFC, NFKC) the decomposed
 * sequence is then recomposed in place.
 *
 * STR:     UTF-8 input.
 * MAX_LEN: number of bytes of STR to examine; a negative value means
 *          "read up to the terminating NUL".
 * MODE:    NFKC/NFKD select compatibility decompositions, NFC/NFKC
 *          enable the composition step.
 *
 * Returns the buffer allocated with g_new (); presumably the caller is
 * expected to release it with g_free () -- confirm against callers.
 */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);

  /* Pass 1: count how many code points the decomposed output needs.  */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      gunichar wc = g_utf8_get_char (p);

      /* Characters in this range go through decompose_hangul () instead
	 of the decomposition table.  */
      if (wc >= 0xac00 && wc <= 0xd7af)
	{
	  gsize result_len;
	  /* NULL output buffer: only the length is wanted here.  */
	  decompose_hangul (wc, NULL, &result_len);
	  n_wc += result_len;
	}
      else
	{
	  decomp = find_decomposition (wc, do_compat);

	  if (decomp)
	    n_wc += g_utf8_strlen (decomp, -1);
	  else
	    n_wc++;
	}

      p = g_utf8_next_char (p);
    }

  /* One extra slot for the terminating 0.  */
  wc_buffer = g_new (gunichar, n_wc + 1);

  /* Pass 2: walk the input again, this time writing the decomposed code
     points.  LAST_START is the index where the run that still has to be
     canonically ordered begins.  */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      /* Index of the first code point written for this input character.  */
      gsize old_n_wc = n_wc;

      if (wc >= 0xac00 && wc <= 0xd7af)
	{
	  gsize result_len;
	  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
	  n_wc += result_len;
	}
      else
	{
	  decomp = find_decomposition (wc, do_compat);

	  if (decomp)
	    {
	      /* The decomposition is itself a NUL-terminated UTF-8
		 string; decode it code point by code point.  */
	      const char *pd;
	      for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
		wc_buffer[n_wc++] = g_utf8_get_char (pd);
	    }
	  else
	    wc_buffer[n_wc++] = wc;
	}

      /* NOTE(review): wc_buffer[old_n_wc] is only meaningful if this
	 input character produced at least one code point -- confirm that
	 decompose_hangul () never reports a length of 0.  */
      if (n_wc > 0)
	{
	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

	  /* A class-0 character begins a new run: order everything
	     collected since LAST_START, then start the next run at this
	     character.  */
	  if (cc == 0)
	    {
	      g_unicode_canonical_ordering (wc_buffer + last_start,
					    n_wc - last_start);
	      last_start = old_n_wc;
	    }
	}

      p = g_utf8_next_char (p);
    }

  /* Order whatever is left in the final run.  */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start,
				    n_wc - last_start);
      /* This value is not read again: LAST_START is reset to 0 before
	 its next use in the composition step below.  */
      last_start = n_wc;
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      /* LAST_CC is the combining class of the character preceding
	 wc_buffer[i]; LAST_START is now the index of the character that
	 later characters are combined into.  */
      int last_cc = 0;
      last_start = 0;

      for (i = 0; i < n_wc; i++)
	{
	  int cc = COMBINING_CLASS (wc_buffer[i]);

	  /* Try to merge wc_buffer[i] into wc_buffer[last_start] unless
	     the preceding character has the same non-zero class.
	     combine () presumably stores the composed character through
	     its third argument and returns non-zero on success --
	     confirm against its definition.  */
	  if (i > 0 &&
	      (last_cc == 0 || last_cc != cc) &&
	      combine (wc_buffer[last_start], wc_buffer[i],
		       &wc_buffer[last_start]))
	    {
	      /* Close the gap left by the consumed character.  */
	      for (j = i + 1; j < n_wc; j++)
		wc_buffer[j - 1] = wc_buffer[j];
	      n_wc--;
	      /* Step back so the loop's i++ revisits the element that
		 just moved into slot i; i > 0 here, so no wrap-around.  */
	      i--;

	      /* Recompute the class of the character that now precedes
		 the next candidate.  */
	      if (i == last_start)
		last_cc = 0;
	      else
		last_cc = COMBINING_CLASS (wc_buffer[i - 1]);

	      continue;
	    }

	  /* A class-0 character becomes the new combination target.  */
	  if (cc == 0)
	    last_start = i;

	  last_cc = cc;
	}
    }

  /* Re-terminate: composition may have shortened the sequence.  */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
コード例 #4
0
ファイル: gunidecomp.c プロジェクト: 01org/android-bluez-glib
/**
 * g_unichar_combining_class:
 * @uc: a Unicode character
 *
 * Looks up the canonical combining class of @uc.
 *
 * Return value: the combining class of @uc
 *
 * Since: 2.14
 **/
gint
g_unichar_combining_class (gunichar uc)
{
  const gint combining_class = COMBINING_CLASS (uc);

  return combining_class;
}
コード例 #5
0
ファイル: decompose.c プロジェクト: now/ned
/* {{{1
 * Look up the combining class of ‘c’ and return it.
 */
inline int
_unichar_combining_class(unichar c)
{
	int cc = COMBINING_CLASS(c);

	return cc;
}
コード例 #6
0
ファイル: decompose.c プロジェクト: now/ned
/* {{{1
 * Normalize (compose/decompose) the characters in ‘str’ so that strings
 * that actually contain the same characters will be recognized as equal,
 * for example when comparing them.
 *
 * When ‘use_len’ is true at most ‘max_len’ bytes of ‘str’ are examined;
 * otherwise ‘str’ is read up to its terminating NUL.  ‘mode’ selects the
 * normalization form: NFKC and NFKD use compatibility decompositions, and
 * NFC and NFKC recompose the decomposed, canonically ordered result.
 *
 * Returns a NUL-terminated buffer obtained from ALLOC_N.
 */
unichar *
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
{
	bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
	/* This must compare ‘mode’, not assign to it: an assignment here
	 * would clobber ‘mode’ and yield the wrong answer. */
	bool do_compose = (mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC);

	/* Pass 1: count how many characters the decomposed output needs. */
	size_t n = 0;
	const char *p = str;
	while ((!use_len || p < str + max_len) && *p != NUL) {
		unichar c = utf_char(p);

		/* Characters in this range go through decompose_hangul()
		 * instead of the decomposition table. */
		if (c >= 0xac00 && c <= 0xd7af) {
			size_t len;

			/* NULL output buffer: only the length is wanted. */
			decompose_hangul(c, NULL, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			n += (decomp != NULL) ? utf_length(decomp) : 1;
		}

		p = utf_next(p);
	}

	/* One extra slot for the terminating NUL. */
	unichar *buf = ALLOC_N(unichar, n + 1);

	/* Pass 2: write the decomposed characters.  ‘prev_start’ is the
	 * index where the run still to be canonically ordered begins. */
	size_t prev_start;
	for (p = str, prev_start = 0, n = 0; (!use_len || p < str + max_len) && *p != NUL; p = utf_next(p)) {
		unichar c = utf_char(p);
		/* Index of the first character written for this input
		 * character. */
		size_t prev_n = n;

		if (c >= 0xac00 && c <= 0xd7af) {
			size_t len;

			decompose_hangul(c, buf + n, &len);
			n += len;
		} else {
			const char *decomp = find_decomposition(c, do_compat);

			if (decomp != NULL) {
				for ( ; *decomp != NUL; decomp = utf_next(decomp))
					buf[n++] = utf_char(decomp);
			} else {
				buf[n++] = c;
			}
		}

		/* A class-0 character begins a new run: order everything
		 * collected since ‘prev_start’, then start the next run at
		 * this character. */
		if (n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
			unicode_canonical_ordering(buf + prev_start, n - prev_start);
			prev_start = prev_n;
		}
	}

	/* Order whatever is left in the final run. */
	if (n > 0)
		unicode_canonical_ordering(buf + prev_start, n - prev_start);

	buf[n] = NUL;

	/* done with decomposition and reordering */

	if (do_compose && n > 0) {
		/* ‘prev_start’ is now the index of the character that later
		 * characters are combined into; ‘prev_cc’ is the combining
		 * class of the character preceding buf[i]. */
		prev_start = 0;
		int prev_cc = 0;
		for (size_t i = 0; i < n; i++) {
			int cc = COMBINING_CLASS(buf[i]);

			if (i > 0 && (prev_cc == 0 || prev_cc != cc) && combine(buf[prev_start], buf[i], &buf[prev_start])) {
				/* buf[i] was merged into buf[prev_start]:
				 * close the gap it leaves behind. */
				for (size_t j = i + 1; j < n; j++)
					buf[j - 1] = buf[j];

				n--;
				/* Step back so the loop's i++ revisits the
				 * element that just moved into slot i;
				 * i > 0 here, so this cannot wrap. */
				i--;
				prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
			} else {
				/* A class-0 character becomes the new
				 * combination target. */
				if (cc == 0)
					prev_start = i;

				prev_cc = cc;
			}
		}

		/* Re-terminate: composition may have shortened the
		 * sequence. */
		buf[n] = NUL;
	}

	return buf;
}