Example #1
0
const gchar *detect_charset(const gchar *text)
{
	guint8 c = *text;
	const gchar *charset = NULL;
	
	if (g_utf8_validate(text, -1, NULL)) {
		while ((c = *text++) != '\0') {
			if (c > 0x7F) {
				charset = "UTF-8";
				break;
			}
			if (c == 0x1B) /* ESC */ {
				c = *text++;
				if (c == '$') {
					c = *text++;
					switch (c) {
					case 'B': // JIS X 0208-1983
					case '@': // JIS X 0208-1978
						charset = "ISO-2022-JP";
						continue;
					case 'A': // GB2312-1980
						charset = "ISO-2022-JP-2";
						break;
					case '(':
						c = *text++;
						switch (c) {
						case 'C': // KSC5601-1987
						case 'D': // JIS X 0212-1990
							charset = "ISO-2022-JP-2";
						}
						break;
					case ')':
						c = *text++;
						if (c == 'C')
							charset = "ISO-2022-KR"; // KSC5601-1987
					}
					break;
				}
			}
		}
		if (!charset)
			charset = get_default_charset();
	}
	
	if (!charset) {
		switch (get_encoding_code()) {
		case LATINC:
		case LATINC_UA:
		case LATINC_TJ:
			charset = detect_charset_cylillic(text); // fuzzy...
			break;
		case CHINESE_CN:
		case CHINESE_TW:
		case CHINESE_HK:
			charset = detect_charset_chinese(text);
			break;
		case JAPANESE:
			charset = detect_charset_japanese(text);
			break;
		case KOREAN:
			charset = detect_charset_korean(text);
			break;
		case VIETNAMESE:
		case THAI:
		case GEORGIAN:
			charset = get_encoding_items(get_encoding_code())->item[OPENI18N];
			break;
		default:
			if (strcmp(get_default_charset(), "UTF-8") != 0)
				charset = get_default_charset();
			else if (detect_noniso(text))
				charset = get_encoding_items(get_encoding_code())->item[CODEPAGE];
			else
				charset = get_encoding_items(get_encoding_code())->item[OPENI18N];
			if (!charset)
				charset = get_encoding_items(get_encoding_code())->item[IANA];					
		}
	}
	
	return charset;
}
Example #2
0
const QString detectCharset (const QByteArray byteArray)
{
    const char* text = byteArray.constData();
    uint8_t c = *text;
    std::string charset = "";

    if (validateUTF8 (byteArray))
    {
        while ((c = *text++) != '\0')
        {
            if (c > 0x7F)
            {
                charset = "UTF-8";
                break;
            }
            if (c == 0x1B) /* ESC */
            {
                c = *text++;
                if (c == '$')
                {
                    c = *text++;
                    switch (c)
                    {
                    case 'B': // JIS X 0208-1983
                    case '@': // JIS X 0208-1978
                        charset = "ISO-2022-JP";
                        continue;
                    case 'A': // GB2312-1980
                        charset = "ISO-2022-JP-2";
                        break;
                    case '(':
                        c = *text++;
                        switch (c)
                        {
                        case 'C': // KSC5601-1987
                        case 'D': // JIS X 0212-1990
                            charset = "ISO-2022-JP-2";
                        }
                        break;
                    case ')':
                        c = *text++;
                        if (c == 'C')
                            charset = "ISO-2022-KR"; // KSC5601-1987
                    }
                    break;
                }
            }
        }
        if (charset.empty())
            charset = getDefaultCharset();
    }

    if (charset.empty())
    {
        switch (localeNum)
        {
            case LATIN1:
                /* Windows-1252 */
                charset = detectCharsetLatin (text);
                break;
            case LATINC:
            case LATINC_UA:
            case LATINC_TJ:
                /* Cyrillic */
                charset = detectCharsetCyrillic (text);
                break;
            case LATINA:
                /* MS Windows Arabic */
                charset = detectCharsetWinArabic (text);
                break;
            case CHINESE_CN:
            case CHINESE_TW:
            case CHINESE_HK:
                charset = detectCharsetChinese (text);
                break;
            case JAPANESE:
                charset = detectCharsetJapanese (text);
                break;
            case KOREAN:
                charset = detectCharsetKorean (text);
                break;
            case VIETNAMESE:
            case THAI:
            case GEORGIAN:
                charset = encodingItem[OPENI18N];
                break;
            default:
                if (getDefaultCharset() != "UTF-8")
                    charset = getDefaultCharset();
                else if (detect_noniso (text))
                    charset = encodingItem[CODEPAGE];
                else
                    charset = encodingItem[OPENI18N];
                if (charset.empty())
                    charset = encodingItem[IANA];
        }
    }

    return QString::fromStdString (charset);
}