// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; // Do our own UTF-8 detection, because at least ENCA seems to get it // wrong sometimes (suggested by divVerent). int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) return "UTF-8"; bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; return res; }
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. const char *mp_charset_guess(bstr buf, const char *user_cp) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; #ifdef CONFIG_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(buf, lang); #endif #ifdef CONFIG_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(buf, lang); #endif if (res) { mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_msg(MSGT_SUBREADER, MSGL_DBG2, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "no conversion"); } return res; }
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. // The return value may (but doesn't have to) be allocated under talloc_ctx. const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bool use_auto = strcasecmp(user_cp, "auto") == 0; if (use_auto) { #if HAVE_UCHARDET user_cp = "uchardet"; #elif HAVE_ENCA user_cp = "enca"; #else user_cp = "UTF-8:UTF-8-BROKEN"; #endif } bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; if (use_auto) { res = ms_bom_guess(buf); if (res) type = bstr0("auto"); } #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif #if HAVE_UCHARDET if (bstrcasecmp0(type, "uchardet") == 0) res = mp_uchardet(talloc_ctx, log, buf); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) res = "utf-8"; } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; mp_verbose(log, "Using charset '%s'.\n", res); return res; }