/* Convert the buffer's data to UTF-8 using the encoding explicitly requested by the caller.
 *
 * If forced_enc is "UTF-8" the data is only validated (over buffer->len bytes) and left
 * as is. For any other charset, buffer->size bytes of buffer->data are converted; on
 * success buffer->data is replaced by the converted text and buffer->len is set to its
 * string length.
 *
 * On success buffer->bom is set according to whether the BOM scan of the resulting data
 * reports GEANY_ENCODING_UTF_8, and buffer->enc receives a newly allocated copy of
 * forced_enc.
 *
 * Returns: TRUE on success, FALSE if validation or conversion failed (buffer->data,
 * buffer->bom and buffer->enc are not modified in that case). */
static gboolean handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
{
	GeanyEncodingIndex enc_idx;
	/* number of bytes that may be inspected for a BOM; for untouched data this is
	 * the original byte count */
	gsize scan_len = buffer->size;

	if (utils_str_equal(forced_enc, "UTF-8"))
	{
		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
		{
			return FALSE;
		}
	}
	else
	{
		gchar *converted_text = encodings_convert_to_utf8_from_charset(
			buffer->data, buffer->size, forced_enc, FALSE);

		if (converted_text == NULL)
		{
			return FALSE;
		}

		setptr(buffer->data, converted_text);
		buffer->len = strlen(converted_text);
		/* buffer->size still describes the old, unconverted data; the converted
		 * text must be scanned with its own length, otherwise the scan may look
		 * past the end of a short conversion result or miss a BOM in a result
		 * that is longer than the input */
		scan_len = buffer->len;
	}

	enc_idx = encodings_scan_unicode_bom(buffer->data, scan_len, NULL);
	buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
	buffer->enc = g_strdup(forced_enc);
	return TRUE;
}
/* Python wrapper around encodings_convert_to_utf8_from_charset().
 *
 * Python arguments (positional or keyword):
 *   buffer  - string holding the data to convert
 *   charset - string naming the source charset
 *   size    - optional integer length of buffer, defaults to -1
 *   fast    - optional integer flag passed through to the C function, defaults to 0
 *
 * Returns: a new Python string built from the converted text, None if the conversion
 * returned NULL, or NULL with a Python exception set if the arguments could not be
 * parsed. */
static PyObject *
Encodings_convert_to_utf8_from_charset(PyObject *module, PyObject *args, PyObject *kwargs)
{
	gchar *buffer = NULL, *charset = NULL, *new_buffer = NULL;
	/* the "l" format unit stores a C long; parse into a long and convert afterwards
	 * instead of writing through a gssize pointer of possibly different width */
	long size = -1;
	gint fast = 0;
	PyObject *result;
	/* keyword names must appear in the same order as the format units and the
	 * output pointers below: buffer, charset, size, fast */
	static gchar *kwlist[] = { "buffer", "charset", "size", "fast", NULL };

	if (! PyArg_ParseTupleAndKeywords(args, kwargs, "ss|li", kwlist,
			&buffer, &charset, &size, &fast))
	{
		/* an exception has been set by the parser; propagate it rather than
		 * returning a value while an error is pending */
		return NULL;
	}

	new_buffer = encodings_convert_to_utf8_from_charset(buffer, (gssize) size, charset, fast);
	if (new_buffer == NULL)
	{
		Py_RETURN_NONE;
	}

	/* Py_BuildValue() copies the string, so the C buffer can be released right away */
	result = Py_BuildValue("s", new_buffer);
	g_free(new_buffer);
	return result;
}
/* Work out the encoding of the buffer's data and convert the data to UTF-8 if needed.
 *
 * enc_idx is the encoding indicated by a BOM, or GEANY_ENCODING_NONE if there is none.
 * buffer->enc must be NULL and buffer->bom must be FALSE on entry.
 *
 * On success buffer->enc holds a newly allocated charset name; when a conversion took
 * place buffer->data and buffer->len are replaced by the converted text.
 *
 * Returns: TRUE on success, FALSE if no encoding could be determined. */
static gboolean handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
{
	gchar *regex_charset;
	gchar *utf8_text;

	g_return_val_if_fail(buffer->enc == NULL, FALSE);
	g_return_val_if_fail(buffer->bom == FALSE, FALSE);

	/* No data at all, so assume UTF-8. The check is on size rather than len because
	 * len can be 0 even for a non-empty file, e.g. an empty UTF32 file with a BOM
	 * (size is 4, len is 0). */
	if (buffer->size == 0)
	{
		buffer->enc = g_strdup("UTF-8");
		return TRUE;
	}

	/* a BOM takes precedence over any other detection */
	if (enc_idx != GEANY_ENCODING_NONE)
	{
		buffer->enc = g_strdup(encodings[enc_idx].charset);
		buffer->bom = TRUE;

		/* data announced as something else than UTF-8 has to be converted */
		if (enc_idx != GEANY_ENCODING_UTF_8)
		{
			utf8_text = encodings_convert_to_utf8_from_charset(
				buffer->data, buffer->size, buffer->enc, FALSE);

			if (utf8_text == NULL)
			{
				/* converting from the BOM's encoding failed, forget about the BOM */
				setptr(buffer->enc, NULL);
				buffer->bom = FALSE;
			}
			else
			{
				setptr(buffer->data, utf8_text);
				buffer->len = strlen(utf8_text);
			}
		}
	}

	/* the BOM settled the encoding, nothing more to do */
	if (buffer->enc != NULL)
	{
		return TRUE;
	}

	/* either there was no BOM or its encoding did not work out:
	 * look for an encoding hint inside the file content */
	regex_charset = encodings_check_regexes(buffer->data, buffer->size);

	/* UTF-8 is tried first */
	if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
		buffer->size == buffer->len &&
		g_utf8_validate(buffer->data, buffer->len, NULL))
	{
		buffer->enc = g_strdup("UTF-8");
		g_free(regex_charset);
		return TRUE;
	}

	/* fall back to trying the known encodings, starting with the hint */
	utf8_text = encodings_convert_to_utf8_with_suggestion(
		buffer->data, buffer->size, regex_charset, &buffer->enc);
	g_free(regex_charset);

	if (utf8_text == NULL)
	{
		return FALSE;
	}

	setptr(buffer->data, utf8_text);
	buffer->len = strlen(utf8_text);
	return TRUE;
}
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size, const gchar *suggested_charset, gchar **used_encoding) { const gchar *locale_charset = NULL; const gchar *charset; gchar *utf8_content; gboolean check_suggestion = suggested_charset != NULL; gboolean check_locale = FALSE; gint i, preferred_charset; if ((gint)size == -1) { size = strlen(buffer); } /* current locale is not UTF-8, we have to check this charset */ check_locale = ! g_get_charset(&locale_charset); /* First check for preferred charset, if specified */ preferred_charset = file_prefs.default_open_encoding; if (preferred_charset == encodings[GEANY_ENCODING_NONE].idx || preferred_charset < 0 || preferred_charset >= GEANY_ENCODINGS_MAX) { preferred_charset = -1; } /* -1 means "Preferred charset" */ for (i = -1; i < GEANY_ENCODINGS_MAX; i++) { if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx)) continue; if (check_suggestion) { check_suggestion = FALSE; charset = encodings_normalize_charset(suggested_charset); if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */ charset = suggested_charset; i = -2; /* keep i below the start value to have it again at -1 on the next loop run */ } else if (check_locale) { check_locale = FALSE; charset = locale_charset; i = -2; /* keep i below the start value to have it again at -1 on the next loop run */ } else if (i == -1) { if (preferred_charset >= 0) { charset = encodings[preferred_charset].charset; geany_debug("Using preferred charset: %s", charset); } else continue; } else if (i >= 0) charset = encodings[i].charset; else /* in this case we have i == -2, continue to increase i and go ahead */ continue; if (G_UNLIKELY(charset == NULL)) continue; geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.", size, charset); utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE); if (G_LIKELY(utf8_content != NULL)) { if (used_encoding != NULL) { if 
(G_UNLIKELY(*used_encoding != NULL)) { geany_debug("%s:%d", __FILE__, __LINE__); g_free(*used_encoding); } *used_encoding = g_strdup(charset); } return utf8_content; } } return NULL; }