Exemple #1
0
// Iterate entries. The first call establishes the first entry. Returns false
// if no entry found, otherwise returns true and sets mpa->entry/entry_filename.
bool mp_archive_next_entry(struct mp_archive *mpa)
{
    mpa->entry = NULL;
    talloc_free(mpa->entry_filename);
    mpa->entry_filename = NULL;

    while (!mp_cancel_test(mpa->primary_src->cancel)) {
        struct archive_entry *entry;
        int r = archive_read_next_header(mpa->arch, &entry);
        if (r == ARCHIVE_EOF)
            break;
        if (r < ARCHIVE_OK)
            MP_ERR(mpa, "%s\n", archive_error_string(mpa->arch));
        if (r < ARCHIVE_WARN) {
            MP_FATAL(mpa, "could not read archive entry\n");
            break;
        }
        if (archive_entry_filetype(entry) != AE_IFREG)
            continue;
        // Some archives may have no filenames, or libarchive won't return some.
        const char *fn = archive_entry_pathname(entry);
        char buf[64];
        if (!fn || bstr_validate_utf8(bstr0(fn)) < 0) {
            snprintf(buf, sizeof(buf), "mpv_unknown#%d", mpa->entry_num);
            fn = buf;
        }
        mpa->entry = entry;
        mpa->entry_filename = talloc_strdup(mpa, fn);
        mpa->entry_num += 1;
        return true;
    }

    return false;
}
Exemple #2
0
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
{
    // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
    // (suggested by divVerent). Explicitly allow cut-off UTF-8.
    if (bstr_validate_utf8(buf) > -8)
        return "UTF-8";

    if (!language || !language[0])
        language = "__"; // neutral language

    const char *detected_cp = NULL;

    EncaAnalyser analyser = enca_analyser_alloc(language);
    if (analyser) {
        enca_set_termination_strictness(analyser, 0);
        EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
        const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
        if (tmp && enc.charset != ENCA_CS_UNKNOWN)
            detected_cp = tmp;
        enca_analyser_free(analyser);
    } else {
        mp_err(log, "ENCA doesn't know language '%s'\n", language);
        size_t langcnt;
        const char **languages = enca_get_languages(&langcnt);
        mp_err(log, "ENCA supported languages:");
        for (int i = 0; i < langcnt; i++)
            mp_err(log, " %s", languages[i]);
        mp_err(log, "\n");
        free(languages);
    }

    return detected_cp;
}
Exemple #3
0
// Runs charset auto-detection on the input buffer, and returns the result.
// If auto-detection fails, NULL is returned.
// If user_cp doesn't refer to any known auto-detection (for example because
// it's a real iconv codepage), user_cp is returned without even looking at
// the buf data.
const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
                             int flags)
{
    if (!mp_charset_requires_guess(user_cp))
        return user_cp;

    // Do our own UTF-8 detection, because at least ENCA seems to get it
    // wrong sometimes (suggested by divVerent).
    int r = bstr_validate_utf8(buf);
    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
        return "UTF-8";

    bstr params[3] = {{0}};
    split_colon(user_cp, 3, params);

    bstr type = params[0];
    char lang[100];
    snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
    const char *fallback = params[2].start; // last item, already 0-terminated

    const char *res = NULL;

#if HAVE_ENCA
    if (bstrcasecmp0(type, "enca") == 0)
        res = enca_guess(log, buf, lang);
#endif
#if HAVE_LIBGUESS
    if (bstrcasecmp0(type, "guess") == 0)
        res = libguess_guess(log, buf, lang);
#endif
    if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
        if (!fallback)
            fallback = params[1].start; // must be already 0-terminated
    }

    if (res) {
        mp_dbg(log, "%.*s detected charset: '%s'\n",
               BSTR_P(type), res);
    } else {
        res = fallback;
        mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
               BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
    }

    if (!res && !(flags & MP_STRICT_UTF8))
        res = "UTF-8-BROKEN";

    return res;
}
Exemple #4
0
// Runs charset auto-detection on the input buffer, and returns the result.
// If auto-detection fails, NULL is returned.
// If user_cp doesn't refer to any known auto-detection (for example because
// it's a real iconv codepage), user_cp is returned without even looking at
// the buf data.
// The return value may (but doesn't have to) be allocated under talloc_ctx.
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
                             const char *user_cp, int flags)
{
    if (!mp_charset_requires_guess(user_cp))
        return user_cp;

    bool use_auto = strcasecmp(user_cp, "auto") == 0;
    if (use_auto) {
#if HAVE_UCHARDET
        user_cp = "uchardet";
#elif HAVE_ENCA
        user_cp = "enca";
#else
        user_cp = "UTF-8:UTF-8-BROKEN";
#endif
    }

    bstr params[3] = {{0}};
    split_colon(user_cp, 3, params);

    bstr type = params[0];
    char lang[100];
    snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
    const char *fallback = params[2].start; // last item, already 0-terminated

    const char *res = NULL;

    if (use_auto) {
        res = ms_bom_guess(buf);
        if (res)
            type = bstr0("auto");
    }

#if HAVE_ENCA
    if (bstrcasecmp0(type, "enca") == 0)
        res = enca_guess(log, buf, lang);
#endif
#if HAVE_LIBGUESS
    if (bstrcasecmp0(type, "guess") == 0)
        res = libguess_guess(log, buf, lang);
#endif
#if HAVE_UCHARDET
    if (bstrcasecmp0(type, "uchardet") == 0)
        res = mp_uchardet(talloc_ctx, log, buf);
#endif

    if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
        if (!fallback)
            fallback = params[1].start; // must be already 0-terminated
        int r = bstr_validate_utf8(buf);
        if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
            res = "utf-8";
    }

    if (res) {
        mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res);
    } else {
        res = fallback;
        mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
               BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
    }

    if (!res && !(flags & MP_STRICT_UTF8))
        res = "UTF-8-BROKEN";

    mp_verbose(log, "Using charset '%s'.\n", res);
    return res;
}