Esempio n. 1
0
/* Treat the buffer contents as being encoded in forced_enc and make them UTF-8.
 *
 * - forced_enc equal to "UTF-8": the data is only validated, nothing is converted.
 * - any other charset: the data is converted and buffer->data / buffer->len are
 *   replaced by the converted text.
 *
 * Returns TRUE on success, after filling in buffer->bom and buffer->enc (a copy of
 * forced_enc). Returns FALSE when validation or conversion fails; in that case no
 * field of buffer has been modified. */
static gboolean
handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
{
	GeanyEncodingIndex bom_idx;

	if (! utils_str_equal(forced_enc, "UTF-8"))
	{
		gchar *utf8_text = encodings_convert_to_utf8_from_charset(
			buffer->data, buffer->size, forced_enc, FALSE);

		if (utf8_text == NULL)
		{
			return FALSE;
		}

		/* hand ownership of the converted text to the buffer */
		setptr(buffer->data, utf8_text);
		buffer->len = strlen(utf8_text);
	}
	else if (! g_utf8_validate(buffer->data, buffer->len, NULL))
	{
		return FALSE;
	}

	/* NOTE(review): buffer->size is still the pre-conversion byte count even when
	 * buffer->data was replaced above - confirm this is what the BOM scan expects. */
	bom_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
	buffer->enc = g_strdup(forced_enc);
	buffer->bom = (bom_idx == GEANY_ENCODING_UTF_8);
	return TRUE;
}
Esempio n. 2
0
/* Python binding around encodings_convert_to_utf8_from_charset().
 *
 * Python arguments, in positional order:
 *   buffer  (str, required)  text to convert
 *   charset (str, required)  charset the text is currently in
 *   size    (int, optional)  passed through as the size argument; defaults to -1
 *   fast    (int, optional)  passed through as the fast argument; defaults to 0
 *
 * Returns a new Python string holding the converted text, or None when the
 * conversion itself fails. Returns NULL with a Python exception set when the
 * arguments cannot be parsed or the result string cannot be built. */
static PyObject *
Encodings_convert_to_utf8_from_charset(PyObject *module, PyObject *args, PyObject *kwargs)
{
    gchar *buffer = NULL, *charset = NULL, *new_buffer = NULL;
    /* the "l" format unit writes a C long, so parse into a long and convert
     * afterwards instead of handing it a gssize pointer of possibly other width */
    long size = -1;
    gint fast = 0;
    PyObject *result;
    /* keyword names must be listed in the same order as the format units and the
     * output pointers below: buffer, charset, size, fast */
    static gchar *kwlist[] = { "buffer", "charset", "size", "fast", NULL };

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ss|li", kwlist, &buffer,
        &charset, &size, &fast))
    {
        /* an exception is already pending; returning a value here would make the
         * interpreter complain about a result returned with an error set */
        return NULL;
    }

    new_buffer = encodings_convert_to_utf8_from_charset(buffer, (gssize) size, charset, fast);
    if (new_buffer == NULL)
    {
        Py_RETURN_NONE;
    }

    /* Py_BuildValue copies the text, so the converted buffer can be released */
    result = Py_BuildValue("s", new_buffer);
    g_free(new_buffer);
    return result;
}
Esempio n. 3
0
/* Determine the encoding of the buffer and convert its data to UTF-8 if needed.
 *
 * enc_idx is the encoding indicated by a BOM, or GEANY_ENCODING_NONE when there is none.
 * buffer->enc must be NULL and buffer->bom must be FALSE on entry.
 *
 * The steps are tried in this order:
 *  1. empty data: assume UTF-8;
 *  2. BOM present: use its encoding, converting when it is not UTF-8;
 *  3. otherwise (or when the BOM conversion failed): take a charset hint from the
 *     content, accept the data as-is if it is valid UTF-8, else fall back to detection.
 *
 * Returns TRUE with buffer->enc set on success, FALSE when no conversion worked. */
static gboolean
handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
{
	gchar *hinted_charset;
	gchar *utf8_text;

	g_return_val_if_fail(buffer->enc == NULL, FALSE);
	g_return_val_if_fail(buffer->bom == FALSE, FALSE);

	if (buffer->size == 0)
	{
		/* no data at all, so assume UTF-8. Only size is checked because len can be 0
		 * for a non-empty file, e.g. an empty UTF32 file with a BOM (size is 4, len is 0) */
		buffer->enc = g_strdup("UTF-8");
		return TRUE;
	}

	/* a BOM takes precedence over everything else */
	if (enc_idx != GEANY_ENCODING_NONE)
	{
		buffer->enc = g_strdup(encodings[enc_idx].charset);
		buffer->bom = TRUE;

		if (enc_idx != GEANY_ENCODING_UTF_8)
		{
			/* the BOM announced something other than UTF-8, so convert */
			utf8_text = encodings_convert_to_utf8_from_charset(
				buffer->data, buffer->size, buffer->enc, FALSE);

			if (utf8_text == NULL)
			{
				/* conversion from the BOM's encoding failed: forget the BOM and
				 * fall through to content-based detection below */
				setptr(buffer->enc, NULL);
				buffer->bom = FALSE;
			}
			else
			{
				setptr(buffer->data, utf8_text);
				buffer->len = strlen(utf8_text);
			}
		}
	}

	if (buffer->enc != NULL)
	{
		return TRUE;
	}

	/* no usable BOM: look for an encoding hint inside the file content */
	hinted_charset = encodings_check_regexes(buffer->data, buffer->size);

	/* UTF-8 gets the first chance: the hint must map to UTF-8, the data must not
	 * be cut short (size equals len) and it must validate */
	if (encodings_get_idx_from_charset(hinted_charset) == GEANY_ENCODING_UTF_8 &&
		buffer->size == buffer->len &&
		g_utf8_validate(buffer->data, buffer->len, NULL))
	{
		buffer->enc = g_strdup("UTF-8");
		g_free(hinted_charset);
		return TRUE;
	}

	/* last resort: try the candidate charsets, starting with the hint */
	utf8_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
		buffer->size, hinted_charset, &buffer->enc);
	g_free(hinted_charset);

	if (utf8_text == NULL)
	{
		return FALSE;
	}

	setptr(buffer->data, utf8_text);
	buffer->len = strlen(utf8_text);
	return TRUE;
}
Esempio n. 4
0
/* Convert buffer to UTF-8 by trying candidate charsets one after another and
 * returning the first conversion that succeeds.
 *
 * Candidates are tried in this order:
 *  1. suggested_charset, if not NULL (normalized first, used as-is if that fails);
 *  2. the locale charset, if g_get_charset() reports the locale is not UTF-8;
 *  3. the preferred charset from file_prefs.default_open_encoding, if it is valid;
 *  4. every entry of encodings[] in index order.
 *
 * buffer:            data to convert.
 * size:              number of bytes in buffer; a value that reads as -1 means
 *                    "use strlen(buffer)".
 * suggested_charset: charset to try first, may be NULL.
 * used_encoding:     may be NULL; otherwise *used_encoding receives a g_strdup()ed
 *                    copy of the charset that worked (a previous non-NULL value is
 *                    freed first).
 *
 * Returns the text produced by encodings_convert_to_utf8_from_charset(), or NULL
 * if no candidate charset could convert the data. */
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
		const gchar *suggested_charset, gchar **used_encoding)
{
	const gchar *locale_charset = NULL;
	const gchar *charset;
	gchar *utf8_content;
	gboolean check_suggestion = suggested_charset != NULL;
	gboolean check_locale = FALSE;
	gint i, preferred_charset;

	/* NOTE(review): the comparison is done after truncating size to gint, so any
	 * size whose truncated value is -1 is also treated as "unknown" - confirm
	 * callers never pass such a size */
	if ((gint)size == -1)
	{
		size = strlen(buffer);
	}

	/* current locale is not UTF-8, we have to check this charset */
	check_locale = ! g_get_charset(&locale_charset);

	/* First check for preferred charset, if specified */
	preferred_charset = file_prefs.default_open_encoding;

	/* an out-of-range value or the "none" entry means there is no preferred charset */
	if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx ||
		preferred_charset < 0 ||
		preferred_charset >= GEANY_ENCODINGS_MAX)
	{
		preferred_charset = -1;
	}

	/* -1 means "Preferred charset" */
	/* The loop doubles as a small state machine: the suggestion and locale passes
	 * each consume one iteration and reset i to -2, so that the loop increment
	 * brings it back to -1 for the next pass. Only once both flags are cleared
	 * does i advance through -1 (preferred charset) and 0..GEANY_ENCODINGS_MAX-1. */
	for (i = -1; i < GEANY_ENCODINGS_MAX; i++)
	{
		/* never try the "none" entry of the encodings table */
		if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
			continue;

		if (check_suggestion)
		{
			check_suggestion = FALSE;
			charset = encodings_normalize_charset(suggested_charset);
			if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
				charset = suggested_charset;
			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
		}
		else if (check_locale)
		{
			check_locale = FALSE;
			charset = locale_charset;
			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
		}
		else if (i == -1)
		{
			if (preferred_charset >= 0)
			{
				charset = encodings[preferred_charset].charset;
				geany_debug("Using preferred charset: %s", charset);
			}
			else
				continue;
		}
		else if (i >= 0)
			charset = encodings[i].charset;
		else /* in this case we have i == -2, continue to increase i and go ahead */
			continue;

		/* a candidate without a charset name cannot be tried */
		if (G_UNLIKELY(charset == NULL))
			continue;

		geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
			size, charset);
		utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);

		if (G_LIKELY(utf8_content != NULL))
		{
			/* first successful conversion wins; report which charset it was */
			if (used_encoding != NULL)
			{
				if (G_UNLIKELY(*used_encoding != NULL))
				{
					/* the caller passed in an already-set encoding; log where
					 * and release it before overwriting */
					geany_debug("%s:%d", __FILE__, __LINE__);
					g_free(*used_encoding);
				}
				*used_encoding = g_strdup(charset);
			}
			return utf8_content;
		}
	}

	/* every candidate charset failed */
	return NULL;
}