/*
 * Guess the character encoding of buf with libguess.
 *
 * Returns "UTF-8" when the buffer passes libguess_validate_utf8().
 * Otherwise a language hint is needed: when language is NULL, empty, or
 * "help", the list of accepted languages is logged and NULL is returned.
 * In every other case the result of libguess_determine_encoding() is
 * returned unchanged.
 */
static const char *libguess_guess(bstr buf, const char *language)
{
    if (libguess_validate_utf8(buf.start, buf.len)) {
        return "UTF-8";
    }

    /* A usable hint is a non-empty string other than the "help" keyword. */
    const int language_usable = language != NULL
                                && language[0] != '\0'
                                && strcmp(language, "help") != 0;
    if (language_usable) {
        return libguess_determine_encoding(buf.start, buf.len, language);
    }

    mp_msg(MSGT_SUBREADER, MSGL_ERR,
           "libguess needs a language: "
           "japanese taiwanese chinese korean russian arabic turkish "
           "greek hebrew polish baltic\n");
    return NULL;
}
/*
 * Produce a newly allocated UTF-8 version of the NUL-terminated string str.
 *
 * Returns NULL when str is NULL. Otherwise the steps are, in order:
 *   1. if str already validates as UTF-8, return a duplicate of it;
 *   2. try cd_chardet_to_utf8();
 *   3. fall back to str_to_utf8_fallback().
 *
 * Why the validation guard comes first (summarised from the historical
 * notes kept with this function):
 *   - The playlist calls this function repeatedly, even on strings that
 *     were already converted. Without a guard, a valid UTF-8 string could
 *     be re-converted into a different string if it also happens to be
 *     decodable in one of the configured fallback encodings.
 *   - g_utf8_validate() was reported to suffer from the so-called
 *     "encapsulated UTF-8" problem, and ISO-8859-1 input (seen with
 *     madplug) still requires a guard at this point. When USE_CHARDET is
 *     defined the libguess validator is therefore used instead;
 *     g_utf8_validate() remains the guard otherwise.
 */
static char *
cd_str_to_utf8 (const char * str)
{
    if (str == NULL)
        return NULL;

    const size_t n_bytes = strlen (str);

    /* Already UTF-8?  Then just hand back a copy. */
#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, n_bytes))
        return g_strdup (str);
#else
    if (g_utf8_validate (str, n_bytes, NULL))
        return g_strdup (str);
#endif

    /* Encoding detection and the configured fallback encodings. */
    char * converted = cd_chardet_to_utf8 (str, n_bytes, NULL, NULL);
    if (converted != NULL)
        return converted;

    /* Last resort: character codes >= 128 are masked off and replaced with '?'. */
    return str_to_utf8_fallback (str);
}
static char * cd_chardet_to_utf8 (const char * str, int len, int * arg_bytes_read, int * arg_bytes_write) { char *ret = NULL; int * bytes_read, * bytes_write; int my_bytes_read, my_bytes_write; bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read; bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write; g_return_val_if_fail(str != NULL, NULL); #ifdef USE_CHARDET if (libguess_validate_utf8(str, len)) #else if (g_utf8_validate(str, len, NULL)) #endif { if (len < 0) len = strlen (str); ret = g_malloc (len + 1); memcpy (ret, str, len); ret[len] = 0; if (arg_bytes_read != NULL) * arg_bytes_read = len; if (arg_bytes_write != NULL) * arg_bytes_write = len; return ret; } #ifdef USE_CHARDET char * det = get_string (NULL, "chardet_detector"); if (det[0]) { AUDDBG("guess encoding (%s) %s\n", det, str); const char * encoding = libguess_determine_encoding (str, len, det); AUDDBG("encoding = %s\n", encoding); if (encoding) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } } g_free (det); #endif /* If detection failed or was not enabled, try fallbacks (if there are any) */ if (! ret) { char * fallbacks = get_string (NULL, "chardet_fallback"); char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1); for (char * * enc = split; * enc; enc ++) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; if (len == *bytes_read) break; else { g_free(ret); ret = NULL; } } g_strfreev (split); g_free (fallbacks); } /* First fallback: locale (duh!) 
*/ if (ret == NULL) { gsize read_gsize = 0, written_gsize = 0; ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */ if (ret == NULL) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } if (ret != NULL) { if (g_utf8_validate(ret, -1, NULL)) return ret; else { g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret); g_free(ret); return NULL; } } return NULL; /* If we have no idea, return NULL. */ }