// Iterate entries. The first call establishes the first entry. Returns false // if no entry found, otherwise returns true and sets mpa->entry/entry_filename. bool mp_archive_next_entry(struct mp_archive *mpa) { mpa->entry = NULL; talloc_free(mpa->entry_filename); mpa->entry_filename = NULL; while (!mp_cancel_test(mpa->primary_src->cancel)) { struct archive_entry *entry; int r = archive_read_next_header(mpa->arch, &entry); if (r == ARCHIVE_EOF) break; if (r < ARCHIVE_OK) MP_ERR(mpa, "%s\n", archive_error_string(mpa->arch)); if (r < ARCHIVE_WARN) { MP_FATAL(mpa, "could not read archive entry\n"); break; } if (archive_entry_filetype(entry) != AE_IFREG) continue; // Some archives may have no filenames, or libarchive won't return some. const char *fn = archive_entry_pathname(entry); char buf[64]; if (!fn || bstr_validate_utf8(bstr0(fn)) < 0) { snprintf(buf, sizeof(buf), "mpv_unknown#%d", mpa->entry_num); fn = buf; } mpa->entry = entry; mpa->entry_filename = talloc_strdup(mpa, fn); mpa->entry_num += 1; return true; } return false; }
// Guess the charset of buf with the ENCA library.
//
// log:      log context used for error output.
// buf:      raw input data to analyse.
// language: language hint passed to ENCA; NULL or "" selects the neutral
//           language "__".
//
// Returns "UTF-8" if our own UTF-8 check accepts the buffer, otherwise the
// iconv-style charset name reported by ENCA, or NULL if ENCA does not know the
// language or could not determine a charset.
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
{
    // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
    // (suggested by divVerent). Explicitly allow cut-off UTF-8.
    if (bstr_validate_utf8(buf) > -8)
        return "UTF-8";

    if (!language || !language[0])
        language = "__"; // neutral language

    EncaAnalyser analyser = enca_analyser_alloc(language);
    if (!analyser) {
        // Allocation failed for this language: report it together with the
        // list of languages ENCA does accept.
        mp_err(log, "ENCA doesn't know language '%s'\n", language);
        size_t langcnt = 0;
        const char **languages = enca_get_languages(&langcnt);
        mp_err(log, "ENCA supported languages:");
        // size_t index: matches the type of langcnt and avoids a
        // signed/unsigned comparison.
        for (size_t i = 0; i < langcnt; i++)
            mp_err(log, " %s", languages[i]);
        mp_err(log, "\n");
        // Only the array itself is released here, not the strings in it.
        free(languages);
        return NULL;
    }

    const char *detected_cp = NULL;
    enca_set_termination_strictness(analyser, 0);
    EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
    const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
    // Only accept the name if ENCA actually settled on a charset.
    if (tmp && enc.charset != ENCA_CS_UNKNOWN)
        detected_cp = tmp;
    enca_analyser_free(analyser);
    return detected_cp;
}
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; // Do our own UTF-8 detection, because at least ENCA seems to get it // wrong sometimes (suggested by divVerent). int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) return "UTF-8"; bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; return res; }
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. // The return value may (but doesn't have to) be allocated under talloc_ctx. const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bool use_auto = strcasecmp(user_cp, "auto") == 0; if (use_auto) { #if HAVE_UCHARDET user_cp = "uchardet"; #elif HAVE_ENCA user_cp = "enca"; #else user_cp = "UTF-8:UTF-8-BROKEN"; #endif } bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; if (use_auto) { res = ms_bom_guess(buf); if (res) type = bstr0("auto"); } #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif #if HAVE_UCHARDET if (bstrcasecmp0(type, "uchardet") == 0) res = mp_uchardet(talloc_ctx, log, buf); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) res = "utf-8"; } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; mp_verbose(log, "Using charset '%s'.\n", res); return res; }