void chardet_init (void) { #ifdef USE_CHARDET libguess_determine_encoding(NULL, -1, ""); #endif str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8); }
/*
 * Guess the encoding of a Ruby string with libguess.
 *
 * The string's bytes and length are handed to libguess_determine_encoding()
 * together with the region name obtained from
 * guess4r_enc__guess_region(rb_cEncoding).  The encoding name that comes
 * back is resolved with rb_enc_find() and that result is returned.
 *
 * NOTE(review): the name returned by libguess is forwarded to rb_enc_find()
 * unchecked -- confirm how a NULL result should be handled.
 */
static rb_encoding *_guess_encoding(VALUE str)
{
    /* Fetch the region exactly once.  RSTRING_PTR() is a function-like macro
     * on older Ruby versions and may evaluate its argument more than once,
     * so a call expression must not be passed to it directly. */
    VALUE region = guess4r_enc__guess_region(rb_cEncoding);
    const char *encname;

    encname = libguess_determine_encoding(RSTRING_PTR(str),
                                          RSTRING_LEN(str),
                                          RSTRING_PTR(region));

    return rb_enc_find(encname);
}
/*
 * Ask libguess for the encoding of buf, using the given language hint.
 *
 * A language that is NULL, empty, or the literal "help" is rejected: an
 * error listing the accepted languages is logged and NULL is returned.
 * Otherwise the result of libguess_determine_encoding() on buf.start /
 * buf.len is returned unchanged.
 */
static const char *libguess_guess(bstr buf, const char *language)
{
    /* Short-circuit order matters: language is only dereferenced once it is
     * known to be non-NULL. */
    int usable = language && language[0] && strcmp(language, "help") != 0;

    if (usable)
        return libguess_determine_encoding(buf.start, buf.len, language);

    mp_msg(MSGT_SUBREADER, MSGL_ERR,
           "libguess needs a language: "
           "japanese taiwanese chinese korean russian arabic turkish "
           "greek hebrew polish baltic\n");
    return NULL;
}
static char * cd_chardet_to_utf8 (const char * str, int len, int * arg_bytes_read, int * arg_bytes_write) { char *ret = NULL; int * bytes_read, * bytes_write; int my_bytes_read, my_bytes_write; bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read; bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write; g_return_val_if_fail(str != NULL, NULL); #ifdef USE_CHARDET if (libguess_validate_utf8(str, len)) #else if (g_utf8_validate(str, len, NULL)) #endif { if (len < 0) len = strlen (str); ret = g_malloc (len + 1); memcpy (ret, str, len); ret[len] = 0; if (arg_bytes_read != NULL) * arg_bytes_read = len; if (arg_bytes_write != NULL) * arg_bytes_write = len; return ret; } #ifdef USE_CHARDET char * det = get_string (NULL, "chardet_detector"); if (det[0]) { AUDDBG("guess encoding (%s) %s\n", det, str); const char * encoding = libguess_determine_encoding (str, len, det); AUDDBG("encoding = %s\n", encoding); if (encoding) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } } g_free (det); #endif /* If detection failed or was not enabled, try fallbacks (if there are any) */ if (! ret) { char * fallbacks = get_string (NULL, "chardet_fallback"); char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1); for (char * * enc = split; * enc; enc ++) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; if (len == *bytes_read) break; else { g_free(ret); ret = NULL; } } g_strfreev (split); g_free (fallbacks); } /* First fallback: locale (duh!) 
*/ if (ret == NULL) { gsize read_gsize = 0, written_gsize = 0; ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */ if (ret == NULL) { gsize read_gsize = 0, written_gsize = 0; ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL); * bytes_read = read_gsize; * bytes_write = written_gsize; } if (ret != NULL) { if (g_utf8_validate(ret, -1, NULL)) return ret; else { g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret); g_free(ret); return NULL; } } return NULL; /* If we have no idea, return NULL. */ }