bool EncodingDetector::DetectEncoding(const QByteArray& buffer) { if (buffer.isEmpty()) return false; if (buffer.size() >= 4) { // BOM is max 4 bytes char buff[4] = {'\0'}; memcpy(buff, buffer.constData(), 4); if (memcmp(buff, "\xEF\xBB\xBF", 3) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 3; m_Encoding = QTextCodec::codecForName("UTF-8"); } else if (memcmp(buff, "\x00\x00\xFE\xFF", 4) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 4; m_Encoding = QTextCodec::codecForName("UTF-32BE"); } else if (memcmp(buff, "\xFF\xFE\x00\x00", 4) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 4; m_Encoding = QTextCodec::codecForName("UTF-32LE"); } else if (memcmp(buff, "\xFE\xFF", 2) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 2; m_Encoding = QTextCodec::codecForName("UTF-16BE"); } else if (memcmp(buff, "\xFF\xFE", 2) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 2; m_Encoding = QTextCodec::codecForName("UTF-16LE"); } } if (!m_UseBOM) { if (DetectUTF8(buffer)) { m_Encoding = QTextCodec::codecForName("UTF-8"); } else if (!DetectUTF16(buffer) && !DetectUTF32(buffer)) { // Use default encoding m_Encoding = ApplicationManager::settings()->defaultEncoding(); } m_UseBOM = false; m_BOMSizeInBytes = 0; } return true; } // end of DetectEncoding
bool EncodingDetector::DetectEncoding(const wxByte* buffer, size_t size, bool ConvertToWxString) { ConfigManager* cfgMgr = Manager::Get()->GetConfigManager(_T("editor")); wxString encname = cfgMgr->Read(_T("/default_encoding")); if (cfgMgr->ReadInt(_T("/default_encoding/use_option"), 0) == 1) { // Bypass C::B's auto-detection m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false); if(m_UseLog) { wxString msg; msg.Printf(_T("Warning: bypassing E::B's auto-detection!\n" "Encoding requested is: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } else { if (!buffer) return false; if (size >= 4) { // BOM is max 4 bytes char buff[4] = {'\0'}; memcpy(buff, buffer, 4); if (memcmp(buff, "\xEF\xBB\xBF", 3) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 3; m_Encoding = wxFONTENCODING_UTF8; } else if (memcmp(buff, "\x00\x00\xFE\xFF", 4) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 4; m_Encoding = wxFONTENCODING_UTF32BE; } else if (memcmp(buff, "\x00\x00\xFF\xFE", 4) == 0) { // 00 00 FF FE UCS-4, unusual octet order BOM (2143) // X-ISO-10646-UCS-4-2143 can not (yet) be handled by wxWidgets m_Encoding = (wxFontEncoding)-1; } else if (memcmp(buff, "\xFF\xFE\x00\x00", 4) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 4; m_Encoding = wxFONTENCODING_UTF32LE; } else if (memcmp(buff, "\xFE\xFF\x00\x00", 4) == 0) { // FE FF 00 00 UCS-4, unusual octet order BOM (3412) // X-ISO-10646-UCS-4-3412 can not (yet) be handled by wxWidgets m_Encoding = (wxFontEncoding)-1; } else if (memcmp(buff, "\xFE\xFF", 2) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 2; m_Encoding = wxFONTENCODING_UTF16BE; } else if (memcmp(buff, "\xFF\xFE", 2) == 0) { m_UseBOM = true; m_BOMSizeInBytes = 2; m_Encoding = wxFONTENCODING_UTF16LE; } } if (m_UseBOM) { if(m_UseLog) { wxString msg; msg.Printf(_T("Detected encoding via BOM: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } // Try our own detection for UTF-16 and UTF-32, the mozilla-version does not work without BOM else if ( !DetectUTF16((wxByte*)buffer, size) && !DetectUTF32((wxByte*)buffer, size) ) { // if we still have no results try mozilla's detection m_Encoding = wxFontMapper::Get()->CharsetToEncoding(DoIt((char*)buffer, size), false); if(m_Encoding == wxFONTENCODING_DEFAULT) { wxString enc_name = Manager::Get()->GetConfigManager(_T("editor"))->Read(_T("/default_encoding"), wxLocale::GetSystemEncodingName()); m_Encoding = wxFontMapper::GetEncodingFromName(enc_name); if(m_UseLog) { wxString msg; msg.Printf(_T("Text seems to be pure ASCII!\n" "We use user specified encoding: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } if (m_Encoding < 0) { // Use user-specified one; as a fallback m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false); if(m_UseLog) { wxString msg; msg.Printf(_T("Warning: Using user specified encoding as fallback!\n" "Encoding fallback is: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } m_UseBOM = false; m_BOMSizeInBytes = 0; } } if(m_UseLog) { wxString msg; msg.Printf(_T("Final encoding detected: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } if (ConvertToWxString) { ConvertToWxStr(buffer, size); } return true; } // end of DetectEncoding