Пример #1
0
/*
 * Guess the character encoding of buf.
 *
 * Data that validates as UTF-8 is reported as "UTF-8" without consulting
 * the language hint. Otherwise libguess is asked to determine the encoding,
 * which requires a language name; a NULL, empty or "help" language makes the
 * function log the accepted language names and return NULL.
 *
 * Returns whatever libguess_determine_encoding() returns, the literal
 * "UTF-8", or NULL when no usable language was supplied.
 */
static const char *libguess_guess(bstr buf, const char *language)
{
    /* Valid UTF-8 needs no language hint at all. */
    if (libguess_validate_utf8(buf.start, buf.len))
        return "UTF-8";

    /* A usable hint is a non-empty string other than the "help" keyword. */
    int have_language = language && language[0] &&
                        strcmp(language, "help") != 0;
    if (have_language)
        return libguess_determine_encoding(buf.start, buf.len, language);

    mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: "
           "japanese taiwanese chinese korean russian arabic turkish "
           "greek hebrew polish baltic\n");
    return NULL;
}
Пример #2
0
/*
 * Convert a NUL-terminated string of unknown encoding to UTF-8.
 *
 * Returns NULL for a NULL input. Otherwise the result is, in order of
 * preference: a copy of the input if it already validates as UTF-8, the
 * output of cd_chardet_to_utf8(), or the output of str_to_utf8_fallback().
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (! str)
        return NULL;

    const size_t length = strlen (str);
    int already_utf8;

    /* Why the validation guard exists (history kept from earlier notes):
     *
     * 1. The playlist calls this function repeatedly, including on strings
     *    that were already converted. With fallback encodings configured,
     *    chardet_to_utf8() could re-interpret a valid UTF-8 string as one of
     *    those encodings and produce a different UTF-8 string, so the input
     *    is validated first.
     *
     * 2. g_utf8_validate() suffers from the so-called "encapsulated UTF-8"
     *    problem, so for a while chardet_to_utf8() replaced it as the guard.
     *
     * 3. With madplug, ISO-8859-1 to UTF-8 conversion became an issue. The
     *    g_convert() near the end of chardet_to_utf8() can handle it, but
     *    only behind a UTF-8 validation guard. The libguess DFA validator can
     *    tell encapsulated UTF-8 apart and is considered safe as that guard.
     */
#ifdef USE_CHARDET
    already_utf8 = libguess_validate_utf8 (str, length) ? 1 : 0;
#else
    already_utf8 = g_utf8_validate (str, length, NULL) ? 1 : 0;
#endif

    /* Already UTF-8: hand back a plain copy. */
    if (already_utf8)
        return g_strdup (str);

    /* Let the encoding detector (and its fallbacks) have a go. */
    char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);
    if (converted != NULL)
        return converted;

    /* Last resort: character codes >= 128 are masked off and replaced
     * with '?'. */
    return str_to_utf8_fallback (str);
}
Пример #3
0
/*
 * Convert str to UTF-8, trying progressively less specific encodings.
 *
 * Parameters:
 *   str             - input bytes; must not be NULL.
 *   len             - number of bytes in str, or a negative value if str is
 *                     NUL-terminated and its length should be measured.
 *   arg_bytes_read  - optional out: bytes consumed from str (may be NULL).
 *   arg_bytes_write - optional out: bytes stored in the result (may be NULL).
 *
 * Order of attempts:
 *   1. Input already validates as UTF-8: return a copy.
 *   2. (USE_CHARDET only) the encoding libguess determines for the
 *      configured "chardet_detector".
 *   3. Each encoding listed in "chardet_fallback"; accepted only if the
 *      whole input was consumed.
 *   4. The current locale encoding.
 *   5. ISO-8859-1.
 *
 * Returns a newly allocated string (release with g_free()) that validates
 * as UTF-8, or NULL if every attempt failed or the converted text did not
 * validate.
 */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write)
{
    char *ret = NULL;
    int * bytes_read, * bytes_write;
    int my_bytes_read, my_bytes_write;

    /* Route the counters to local dummies when the caller does not want them,
     * so the code below can store through them unconditionally. */
    bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
    bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;

    g_return_val_if_fail(str != NULL, NULL);

    /* Resolve a negative ("NUL-terminated") length once, up front. Previously
     * this only happened inside the already-UTF-8 branch, so for any other
     * input the negative value was passed to libguess and, worse, the
     * "len == *bytes_read" completeness check in the fallback loop could
     * never succeed, discarding every successful fallback conversion. */
    if (len < 0)
        len = strlen (str);

#ifdef USE_CHARDET
    if (libguess_validate_utf8(str, len))
#else
    if (g_utf8_validate(str, len, NULL))
#endif
    {
        /* Already UTF-8: return a NUL-terminated copy of exactly len bytes. */
        ret = g_malloc (len + 1);
        memcpy (ret, str, len);
        ret[len] = 0;

        * bytes_read = len;
        * bytes_write = len;

        return ret;
    }

#ifdef USE_CHARDET
    char * det = get_string (NULL, "chardet_detector");

    /* An empty detector setting disables libguess detection. */
    if (det[0])
    {
        AUDDBG("guess encoding (%s) %s\n", det, str);
        const char * encoding = libguess_determine_encoding (str, len, det);
        AUDDBG("encoding = %s\n", encoding);
        if (encoding)
        {
            gsize read_gsize = 0, written_gsize = 0;
            ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL);
            * bytes_read = read_gsize;
            * bytes_write = written_gsize;
        }
    }

    g_free (det);
#endif

    /* If detection failed or was not enabled, try fallbacks (if there are any) */
    if (! ret)
    {
        char * fallbacks = get_string (NULL, "chardet_fallback");
        char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);

        for (char * * enc = split; * enc; enc ++)
        {
            gsize read_gsize = 0, written_gsize = 0;
            ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL);
            * bytes_read = read_gsize;
            * bytes_write = written_gsize;

            /* Accept this encoding only if the entire input was consumed;
             * otherwise drop the result and try the next candidate. */
            if (len == *bytes_read)
                break;

            g_free(ret);
            ret = NULL;
        }

        g_strfreev (split);
        g_free (fallbacks);
    }

    /* Next fallback: the current locale encoding. */
    if (ret == NULL)
    {
        gsize read_gsize = 0, written_gsize = 0;
        ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
        * bytes_read = read_gsize;
        * bytes_write = written_gsize;
    }

    /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
    if (ret == NULL)
    {
        gsize read_gsize = 0, written_gsize = 0;
        ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL);
        * bytes_read = read_gsize;
        * bytes_write = written_gsize;
    }

    /* If we have no idea, return NULL. */
    if (ret == NULL)
        return NULL;

    /* Never hand back text that does not validate as UTF-8. */
    if (! g_utf8_validate(ret, -1, NULL))
    {
        g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
        g_free(ret);
        return NULL;
    }

    return ret;
}