// Returns true if user_cp implies that calling mp_charset_guess() on the // input data is required to determine the real codepage. This is the case // if user_cp is not a real iconv codepage, but a magic value that requests // for example ENCA charset auto-detection. bool mp_charset_requires_guess(const char *user_cp) { bstr res[2] = {{0}}; split_colon(user_cp, 2, res); return bstrcasecmp0(res[0], "enca") == 0 || bstrcasecmp0(res[0], "guess") == 0; }
// Returns true if user_cp implies that calling mp_charset_guess() on the // input data is required to determine the real codepage. This is the case // if user_cp is not a real iconv codepage, but a magic value that requests // for example ENCA charset auto-detection. bool mp_charset_requires_guess(const char *user_cp) { bstr res[2] = {{0}}; int r = split_colon(user_cp, 2, res); // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 // by default, plus a codepage that is used if the input is not UTF-8. return bstrcasecmp0(res[0], "enca") == 0 || bstrcasecmp0(res[0], "guess") == 0 || (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); }
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; // Do our own UTF-8 detection, because at least ENCA seems to get it // wrong sometimes (suggested by divVerent). int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) return "UTF-8"; bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; return res; }
// Looks up an audio format by its short name. Walks af_fmtstr_table until
// the entry whose name is NULL and returns the format of the first entry
// whose name matches str according to bstrcasecmp0(). Returns 0 if there is
// no match.
int af_str2fmt_short(bstr str)
{
    int idx = 0;
    while (af_fmtstr_table[idx].name) {
        if (bstrcasecmp0(str, af_fmtstr_table[idx].name) == 0)
            return af_fmtstr_table[idx].format;
        idx++;
    }
    return 0;
}
// Returns the value stored for key in tags, or NULL if there is no such key.
// Keys are compared with bstrcasecmp0(); the first matching entry wins. The
// returned pointer is the stored value itself, not a copy.
char *mp_tags_get_bstr(struct mp_tags *tags, bstr key)
{
    for (int idx = 0; idx < tags->num_keys; idx++) {
        if (bstrcasecmp0(key, tags->keys[idx]) != 0)
            continue;
        return tags->values[idx];
    }
    return NULL;
}
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. const char *mp_charset_guess(bstr buf, const char *user_cp) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; #ifdef CONFIG_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(buf, lang); #endif #ifdef CONFIG_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(buf, lang); #endif if (res) { mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_msg(MSGT_SUBREADER, MSGL_DBG2, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "no conversion"); } return res; }
// Converts a textual audio format specification to its format code.
//
// Two spellings are accepted:
//   - a raw hexadecimal code starting with "0x", which must parse completely
//     and be accepted by af_fmt_valid();
//   - a short format name listed in af_fmtstr_table (compared with
//     bstrcasecmp0()).
// Returns the format code, or -1 if str matches neither.
int af_str2fmt_short(bstr str)
{
    if (bstr_startswith0(str, "0x")) {
        bstr rest;
        // Keep the parsed value at full width (presumably long long, going by
        // the helper's name) so that an oversized literal is rejected instead
        // of being silently truncated to an int that might happen to be a
        // valid format.
        long long parsed = bstrtoll(str, &rest, 16);
        int fmt = (int)parsed;
        if (rest.len == 0 && fmt == parsed && af_fmt_valid(fmt))
            return fmt;
    }

    for (int i = 0; af_fmtstr_table[i].name; i++) {
        if (!bstrcasecmp0(str, af_fmtstr_table[i].name))
            return af_fmtstr_table[i].format;
    }

    return -1;
}
// Sets the tag named key to value. If an entry whose key matches according
// to bstrcasecmp0() already exists, its value is replaced; otherwise a new
// key/value pair is appended. Both strings are copied with bstrto0() using
// tags as the allocation context, so the caller keeps ownership of key and
// value.
void mp_tags_set_bstr(struct mp_tags *tags, bstr key, bstr value)
{
    for (int n = 0; n < tags->num_keys; n++) {
        if (bstrcasecmp0(key, tags->keys[n]) == 0) {
            // Copy before freeing: value may point into the string that is
            // currently stored (e.g. when it was obtained from
            // mp_tags_get_bstr()), and must not be read after the free.
            char *replacement = bstrto0(tags, value);
            talloc_free(tags->values[n]);
            tags->values[n] = replacement;
            return;
        }
    }

    // No existing entry: grow both arrays by one slot and append.
    MP_RESIZE_ARRAY(tags, tags->keys, tags->num_keys + 1);
    MP_RESIZE_ARRAY(tags, tags->values, tags->num_keys + 1);
    tags->keys[tags->num_keys] = bstrto0(tags, key);
    tags->values[tags->num_keys] = bstrto0(tags, value);
    tags->num_keys++;
}
// Parses a PLS-style playlist from p.
//
// The first non-empty (after stripping) line must be "[playlist]" (compared
// with bstrcasecmp0()); otherwise -1 is returned. When p->probing is set,
// the function stops right after that check and returns 0. Otherwise every
// remaining line of the form "key=value" whose key starts with "File"
// (checked with bstr_case_startswith()) has its value passed to pl_add().
// Returns 0 once the end of input is reached.
static int parse_pls(struct pl_parser *p)
{
    // Skip leading blank lines to find the header.
    bstr line = {0};
    for (;;) {
        if (line.len || pl_eof(p))
            break;
        line = bstr_strip(pl_get_line(p));
    }

    if (bstrcasecmp0(line, "[playlist]") != 0)
        return -1;
    if (p->probing)
        return 0;

    while (!pl_eof(p)) {
        bstr key, value;
        line = bstr_strip(pl_get_line(p));
        if (!bstr_split_tok(line, "=", &key, &value))
            continue;
        if (bstr_case_startswith(key, bstr0("File")))
            pl_add(p, value);
    }
    return 0;
}
// Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. // The return value may (but doesn't have to) be allocated under talloc_ctx. const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bool use_auto = strcasecmp(user_cp, "auto") == 0; if (use_auto) { #if HAVE_UCHARDET user_cp = "uchardet"; #elif HAVE_ENCA user_cp = "enca"; #else user_cp = "UTF-8:UTF-8-BROKEN"; #endif } bstr params[3] = {{0}}; split_colon(user_cp, 3, params); bstr type = params[0]; char lang[100]; snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); const char *fallback = params[2].start; // last item, already 0-terminated const char *res = NULL; if (use_auto) { res = ms_bom_guess(buf); if (res) type = bstr0("auto"); } #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); #endif #if HAVE_LIBGUESS if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif #if HAVE_UCHARDET if (bstrcasecmp0(type, "uchardet") == 0) res = mp_uchardet(talloc_ctx, log, buf); #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated int r = bstr_validate_utf8(buf); if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) res = "utf-8"; } if (res) { mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; mp_verbose(log, "Using charset '%s'.\n", res); return res; }