const gchar *detect_charset(const gchar *text) { guint8 c = *text; const gchar *charset = NULL; if (g_utf8_validate(text, -1, NULL)) { while ((c = *text++) != '\0') { if (c > 0x7F) { charset = "UTF-8"; break; } if (c == 0x1B) /* ESC */ { c = *text++; if (c == '$') { c = *text++; switch (c) { case 'B': // JIS X 0208-1983 case '@': // JIS X 0208-1978 charset = "ISO-2022-JP"; continue; case 'A': // GB2312-1980 charset = "ISO-2022-JP-2"; break; case '(': c = *text++; switch (c) { case 'C': // KSC5601-1987 case 'D': // JIS X 0212-1990 charset = "ISO-2022-JP-2"; } break; case ')': c = *text++; if (c == 'C') charset = "ISO-2022-KR"; // KSC5601-1987 } break; } } } if (!charset) charset = get_default_charset(); } if (!charset) { switch (get_encoding_code()) { case LATINC: case LATINC_UA: case LATINC_TJ: charset = detect_charset_cylillic(text); // fuzzy... break; case CHINESE_CN: case CHINESE_TW: case CHINESE_HK: charset = detect_charset_chinese(text); break; case JAPANESE: charset = detect_charset_japanese(text); break; case KOREAN: charset = detect_charset_korean(text); break; case VIETNAMESE: case THAI: case GEORGIAN: charset = get_encoding_items(get_encoding_code())->item[OPENI18N]; break; default: if (strcmp(get_default_charset(), "UTF-8") != 0) charset = get_default_charset(); else if (detect_noniso(text)) charset = get_encoding_items(get_encoding_code())->item[CODEPAGE]; else charset = get_encoding_items(get_encoding_code())->item[OPENI18N]; if (!charset) charset = get_encoding_items(get_encoding_code())->item[IANA]; } } return charset; }
/*
 * Guess the character set of the bytes held in `byteArray`.
 *
 * Stage 1 (only when validateUTF8() accepts the data): scan the bytes up to
 * the first NUL.
 *   - any byte above 0x7F            -> "UTF-8"
 *   - ESC $ B  or  ESC $ @           -> "ISO-2022-JP" (scanning continues, so
 *                                       a later escape can still override it)
 *   - ESC $ A, ESC $ ( C, ESC $ ( D  -> "ISO-2022-JP-2"
 *   - ESC $ ) C                      -> "ISO-2022-KR"
 *   Any other "ESC $" sequence ends the scan.  If nothing was recognised,
 *   the result of getDefaultCharset() is used.
 *
 * Stage 2 (only when stage 1 left the charset empty): dispatch on `localeNum`
 * to a language-specific detector or to an entry of `encodingItem`.
 *
 * Returns the chosen charset name as a QString.
 */
const QString detectCharset (const QByteArray byteArray)
{
    const char* text = byteArray.constData();
    std::string charset;

    if (validateUTF8 (byteArray))
    {
        /* Scan with a separate cursor so that `text` still points at the
           start of the data if stage 2 has to run afterwards. */
        const char* p = text;
        uint8_t c;

        while ((c = *p++) != '\0')
        {
            if (c > 0x7F)
            {
                charset = "UTF-8";
                break;
            }
            if (c == 0x1B) /* ESC */
            {
                c = *p++;
                /* ESC was the last byte: the terminator has just been
                   consumed, so stop here instead of letting the loop
                   condition read past the end of the buffer. */
                if (c == '\0')
                    break;
                if (c == '$')
                {
                    c = *p++;
                    switch (c)
                    {
                    case 'B': // JIS X 0208-1983
                    case '@': // JIS X 0208-1978
                        charset = "ISO-2022-JP";
                        continue; // keep scanning the rest of the text
                    case 'A': // GB2312-1980
                        charset = "ISO-2022-JP-2";
                        break;
                    case '(':
                        c = *p++;
                        switch (c)
                        {
                        case 'C': // KSC5601-1987
                        case 'D': // JIS X 0212-1990
                            charset = "ISO-2022-JP-2";
                        }
                        break;
                    case ')':
                        c = *p++;
                        if (c == 'C')
                            charset = "ISO-2022-KR"; // KSC5601-1987
                    }
                    // Every "ESC $" sequence except B/@ ends the scan.
                    break;
                }
            }
        }
        if (charset.empty())
            charset = getDefaultCharset();
    }

    if (charset.empty())
    {
        switch (localeNum)
        {
        case LATIN1:
            /* Windows-1252 */
            charset = detectCharsetLatin (text);
            break;
        case LATINC:
        case LATINC_UA:
        case LATINC_TJ:
            /* Cyrillic */
            charset = detectCharsetCyrillic (text);
            break;
        case LATINA:
            /* MS Windows Arabic */
            charset = detectCharsetWinArabic (text);
            break;
        case CHINESE_CN:
        case CHINESE_TW:
        case CHINESE_HK:
            charset = detectCharsetChinese (text);
            break;
        case JAPANESE:
            charset = detectCharsetJapanese (text);
            break;
        case KOREAN:
            charset = detectCharsetKorean (text);
            break;
        case VIETNAMESE:
        case THAI:
        case GEORGIAN:
            charset = encodingItem[OPENI18N];
            break;
        default:
            if (getDefaultCharset() != "UTF-8")
                charset = getDefaultCharset();
            else if (detect_noniso (text))
                charset = encodingItem[CODEPAGE];
            else
                charset = encodingItem[OPENI18N];
            // Last resort when the selected table entry is empty.
            if (charset.empty())
                charset = encodingItem[IANA];
        }
    }

    return QString::fromStdString (charset);
}