コード例 #1
0
ファイル: utils.cpp プロジェクト: Anthonyfeng/fcitx-qt5
int
_utf8_get_char_extended(const char *s,
                             int max_len)
{
    const unsigned char*p = (const unsigned char*)s;
    int i, len;
    unsigned int wc = (unsigned char) * p;

    if (wc < 0x80) {
        return wc;
    } else if (wc < 0xc0) {
        return (unsigned int) - 1;
    } else if (wc < 0xe0) {
        len = 2;
        wc &= 0x1f;
    } else if (wc < 0xf0) {
        len = 3;
        wc &= 0x0f;
    } else if (wc < 0xf8) {
        len = 4;
        wc &= 0x07;
    } else if (wc < 0xfc) {
        len = 5;
        wc &= 0x03;
    } else if (wc < 0xfe) {
        len = 6;
        wc &= 0x01;
    } else {
        return (unsigned int) - 1;
    }

    if (max_len >= 0 && len > max_len) {
        for (i = 1; i < max_len; i++) {
            if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
                return (unsigned int) - 1;
        }

        return (unsigned int) - 2;
    }

    for (i = 1; i < len; ++i) {
        unsigned int ch = ((unsigned char *)p)[i];

        if ((ch & 0xc0) != 0x80) {
            if (ch)
                return (unsigned int) - 1;
            else
                return (unsigned int) - 2;
        }

        wc <<= 6;

        wc |= (ch & 0x3f);
    }

    if (UTF8_LENGTH(wc) != len)
        return (unsigned int) - 1;

    return wc;
}
コード例 #2
0
static void test_valid_turkish()
{
	long nwritten;
	long nread;
	char *res;
	int i;
	long size;
	gunichar *verify;
	unsigned char *back;

	unsigned char buf[2];

	static int map_size =
		sizeof(gsm_turkish_to_unicode_map) / sizeof(unsigned short) / 2;

	for (i = 0; i < map_size; i++) {
		unsigned short c = gsm_turkish_to_unicode_map[i*2];

		if (c & 0x1b00) {
			buf[0] = 0x1b;
			buf[1] = c & 0x7f;
			size = 2;
		} else {
			size = 1;
			buf[0] = c & 0x7f;
		}

		res = convert_gsm_to_utf8_with_lang(buf, size, &nread, &nwritten, 0, 1, 1);
		g_assert(res);

		if (g_test_verbose())
			g_print("size: %ld, nread:%ld, nwritten:%ld, %s\n",
				size, nread, nwritten, res);

		g_assert(nread == size);

		verify = g_utf8_to_ucs4(res, -1, NULL, NULL, NULL);

		g_assert(verify[0] == gsm_turkish_to_unicode_map[i*2+1]);
		g_assert(verify[1] == 0);

		g_assert(nwritten == UTF8_LENGTH(verify[0]));

		back = convert_utf8_to_gsm_with_lang(res, -1, &nread, &nwritten, 0, 1, 1);

		g_assert(back);

		g_assert(nwritten == size);
		if (c & 0x1b00) {
			g_assert(back[0] == 0x1b);
			g_assert(back[1] == (c & 0x7f));
		} else {
			g_assert(back[0] == (c & 0x7f));
		}

		g_free(back);
		g_free(verify);
		g_free(res);
	}
}
コード例 #3
0
ファイル: nfkc.c プロジェクト: Reve/Shakespeer
/*
 * g_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 * @error: location to store the error occuring, or %NULL to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4.
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with g_free(). If an
 *               error occurs, %NULL will be returned and
 *               @error set.
 **/
gchar *
g_ucs4_to_utf8 (const gunichar * str,
		glong len,
		glong * items_read, glong * items_written, GError ** error)
{
  gint result_length;
  gchar *result = NULL;
  gchar *p;
  gint i;

  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
	break;

      if (str[i] >= 0x80000000)
	{
	  if (items_read)
	    *items_read = i;

	  g_set_error (error, G_CONVERT_ERROR,
		       G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
		       _("Character out of range for UTF-8"));
	  goto err_out;
	}

      result_length += UTF8_LENGTH (str[i]);
    }

  result = g_malloc (result_length + 1);
  if (!result)
    return NULL;
  p = result;

  i = 0;
  while (p < result + result_length)
    p += g_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
コード例 #4
0
ファイル: unicode.cpp プロジェクト: dbluelle/pandora-einstein
/**
 * g_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte. 
 * @error: location to store the error occuring, or %NULL to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4.
 * to UTF-8. The result will be terminated with a 0 byte.
 * 
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with g_free(). If an
 *               error occurs, %NULL will be returned and
 *               @error set.
 **/
char *
g_ucs4_to_utf8 (const wchar_t *str,
		long           len,              
		long          *items_read,       
		long          *items_written,    
		wchar_t       **error)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  result_length = 0;
  for (i = 0; len < 0 || i < len ; i++)
    {
      if (!str[i])
	break;

      if ((unsigned)str[i] >= 0x80000000)
	{
	  if (items_read)
	    *items_read = i;
          if (error)
              *error = L"Character out of range for UTF-8";
	  goto err_out;
	}
      
      result_length += UTF8_LENGTH (str[i]);
    }

  result = (char*)malloc (result_length + 1);
  p = result;

  i = 0;
  while (p < result + result_length)
    p += g_unichar_to_utf8 (str[i++], p);
  
  *p = '\0';

  if (items_written)
    *items_written = p - result;

 err_out:
  if (items_read)
    *items_read = i;

  return result;
}
コード例 #5
0
ファイル: nfkc.c プロジェクト: stela/libidn
/*
 * g_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length (number of characters) of @str to use.
 *       If @len < 0, then the string is nul-terminated.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 * @error: location to store the error occurring, or %NULL to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4.
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with g_free(). If an
 *               error occurs, %NULL will be returned and
 *               @error set. In that case, @items_read will be
 *               set to the position of the first invalid input
 *               character.
 **/
static gchar *
g_ucs4_to_utf8 (const gunichar * str,
		glong len,
		glong * items_read, glong * items_written)
{
  gint result_length;
  gchar *result = NULL;
  gchar *p;
  gint i;

  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
	break;

      if (str[i] >= 0x80000000)
	goto err_out;

      result_length += UTF8_LENGTH (str[i]);
    }

  result = g_malloc (result_length + 1);
  if (!result)
    return NULL;
  p = result;

  i = 0;
  while (p < result + result_length)
    p += g_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
コード例 #6
0
ファイル: ccUTF8.cpp プロジェクト: 6520874/pipiGame
/**
 * cc_utf16_to_utf8:
 * @str: a UTF-16 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of words read, or %nullptr.
 *              If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 *              returned in case @str contains a trailing partial
 *              character. If an error occurs then the index of the
 *              invalid input is stored here.
 * @items_written: location to store number of bytes written, or %nullptr.
 *                 The value stored here does not include the trailing
 *                 0 byte.
 * @error: location to store the error occuring, or %nullptr to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from UTF-16 to UTF-8. The result will be
 * terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free(). If an
 *               error occurs, %nullptr will be returned and
 *               @error set.
 **/
char *
cc_utf16_to_utf8 (const unsigned short  *str,
                  int             len,
                  long            *items_read,
                  long            *items_written)
{
    /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
     * are marked.
     */
    const unsigned short *in;
    char *out;
    char *result = nullptr;
    int n_bytes;
    unsigned int high_surrogate;

    if (str == 0) return nullptr;

    n_bytes = 0;
    in = str;
    high_surrogate = 0;
    while ((len < 0 || in - str < len) && *in)
    {
        unsigned short c = *in;
        unsigned int wc;

        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
            if (high_surrogate)
            {
                wc = SURROGATE_VALUE (high_surrogate, c);
                high_surrogate = 0;
            }
            else
            {
                CCLOGERROR("Invalid sequence in conversion input");
                goto err_out;
            }
        }
        else
        {
            if (high_surrogate)
            {
                CCLOGERROR("Invalid sequence in conversion input");
                goto err_out;
            }

            if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
            {
                high_surrogate = c;
                goto next1;
            }
            else
                wc = c;
        }

        /********** DIFFERENT for UTF8/UCS4 **********/
        n_bytes += UTF8_LENGTH (wc);

next1:
        in++;
    }

    if (high_surrogate && !items_read)
    {
        CCLOGERROR("Partial character sequence at end of input");
        goto err_out;
    }

    /* At this point, everything is valid, and we just need to convert
     */
    /********** DIFFERENT for UTF8/UCS4 **********/
    result = new char[n_bytes + 1];

    high_surrogate = 0;
    out = result;
    in = str;
    while (out < result + n_bytes)
    {
        unsigned short c = *in;
        unsigned int wc;

        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
            wc = SURROGATE_VALUE (high_surrogate, c);
            high_surrogate = 0;
        }
        else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
        {
            high_surrogate = c;
            goto next2;
        }
        else
            wc = c;

        /********** DIFFERENT for UTF8/UCS4 **********/
        out += cc_unichar_to_utf8 (wc, out);

next2:
        in++;
    }

    /********** DIFFERENT for UTF8/UCS4 **********/
    *out = '\0';

    if (items_written)
        /********** DIFFERENT for UTF8/UCS4 **********/
        *items_written = out - result;

err_out:
    if (items_read)
        *items_read = in - str;

    return result;
}
コード例 #7
0
ファイル: gutf8.c プロジェクト: cosimoc/glib
/**
 * g_utf16_to_utf8:
 * @str: a UTF-16 encoded string
 * @len: the maximum length (number of #gunichar2) of @str to use. 
 *     If @len < 0, then the string is nul-terminated.
 * @items_read: (out caller-allocates) (optional): location to store number of
 *     words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
 *     be returned in case @str contains a trailing partial character. If
 *     an error occurs then the index of the invalid input is stored here.
 * @items_written: (out caller-allocates) (optional): location to store number
 *     of bytes written, or %NULL. The value stored here does not include the
 *     trailing 0 byte.
 * @error: location to store the error occurring, or %NULL to ignore
 *     errors. Any of the errors in #GConvertError other than
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from UTF-16 to UTF-8. The result will be
 * terminated with a 0 byte.
 *
 * Note that the input is expected to be already in native endianness,
 * an initial byte-order-mark character is not handled specially.
 * g_convert() can be used to convert a byte buffer of UTF-16 data of
 * ambiguous endianess.
 *
 * Further note that this function does not validate the result
 * string; it may e.g. include embedded NUL characters. The only
 * validation done by this function is to ensure that the input can
 * be correctly interpreted as UTF-16, i.e. it doesn't contain
 * things unpaired surrogates.
 *
 * Returns: a pointer to a newly allocated UTF-8 string.
 *     This value must be freed with g_free(). If an error occurs,
 *     %NULL will be returned and @error set.
 **/
gchar *
g_utf16_to_utf8 (const gunichar2  *str,
		 glong             len,
		 glong            *items_read,
		 glong            *items_written,
		 GError          **error)
{
  /* This function and g_utf16_to_ucs4 are almost exactly identical -
   * The lines that differ are marked.
   */
  const gunichar2 *in;
  gchar *out;
  gchar *result = NULL;
  gint n_bytes;
  gunichar high_surrogate;

  g_return_val_if_fail (str != NULL, NULL);

  n_bytes = 0;
  in = str;
  high_surrogate = 0;
  while ((len < 0 || in - str < len) && *in)
    {
      gunichar2 c = *in;
      gunichar wc;

      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
	{
	  if (high_surrogate)
	    {
	      wc = SURROGATE_VALUE (high_surrogate, c);
	      high_surrogate = 0;
	    }
	  else
	    {
	      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                                   _("Invalid sequence in conversion input"));
	      goto err_out;
	    }
	}
      else
	{
	  if (high_surrogate)
	    {
	      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                                   _("Invalid sequence in conversion input"));
	      goto err_out;
	    }

	  if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
	    {
	      high_surrogate = c;
	      goto next1;
	    }
	  else
	    wc = c;
	}

      /********** DIFFERENT for UTF8/UCS4 **********/
      n_bytes += UTF8_LENGTH (wc);

    next1:
      in++;
    }

  if (high_surrogate && !items_read)
    {
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
                           _("Partial character sequence at end of input"));
      goto err_out;
    }
  
  /* At this point, everything is valid, and we just need to convert
   */
  /********** DIFFERENT for UTF8/UCS4 **********/
  result = try_malloc_n (n_bytes + 1, 1, error);
  if (result == NULL)
      goto err_out;

  high_surrogate = 0;
  out = result;
  in = str;
  while (out < result + n_bytes)
    {
      gunichar2 c = *in;
      gunichar wc;

      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
	{
	  wc = SURROGATE_VALUE (high_surrogate, c);
	  high_surrogate = 0;
	}
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
	{
	  high_surrogate = c;
	  goto next2;
	}
      else
	wc = c;

      /********** DIFFERENT for UTF8/UCS4 **********/
      out += g_unichar_to_utf8 (wc, out);

    next2:
      in++;
    }
  
  /********** DIFFERENT for UTF8/UCS4 **********/
  *out = '\0';

  if (items_written)
    /********** DIFFERENT for UTF8/UCS4 **********/
    *items_written = out - result;

 err_out:
  if (items_read)
    *items_read = in - str;

  return result;
}
コード例 #8
0
ファイル: nfkc.c プロジェクト: Reve/Shakespeer
/* Like g_utf8_get_char, but take a maximum length
 * and return (gunichar)-2 on incomplete trailing character
 */
static inline gunichar
g_utf8_get_char_extended (const  gchar *p,
			  gssize max_len)  
{
  guint i, len;
  gunichar wc = (guchar) *p;

  if (wc < 0x80)
    {
      return wc;
    }
  else if (wc < 0xc0)
    {
      return (gunichar)-1;
    }
  else if (wc < 0xe0)
    {
      len = 2;
      wc &= 0x1f;
    }
  else if (wc < 0xf0)
    {
      len = 3;
      wc &= 0x0f;
    }
  else if (wc < 0xf8)
    {
      len = 4;
      wc &= 0x07;
    }
  else if (wc < 0xfc)
    {
      len = 5;
      wc &= 0x03;
    }
  else if (wc < 0xfe)
    {
      len = 6;
      wc &= 0x01;
    }
  else
    {
      return (gunichar)-1;
    }
  
  if (max_len >= 0 && len > max_len)
    {
      for (i = 1; i < max_len; i++)
	{
	  if ((((guchar *)p)[i] & 0xc0) != 0x80)
	    return (gunichar)-1;
	}
      return (gunichar)-2;
    }

  for (i = 1; i < len; ++i)
    {
      gunichar ch = ((guchar *)p)[i];
      
      if ((ch & 0xc0) != 0x80)
	{
	  if (ch)
	    return (gunichar)-1;
	  else
	    return (gunichar)-2;
	}

      wc <<= 6;
      wc |= (ch & 0x3f);
    }

  if (UTF8_LENGTH(wc) != len)
    return (gunichar)-1;
  
  return wc;
}