// Determines the text encoding of `buffer` and records the result in
// m_Encoding, m_UseBOM and m_BOMSizeInBytes.
//
// A leading byte order mark (BOM) takes precedence. When none is present the
// content heuristics are tried: DetectUTF8() first, then DetectUTF16() and
// DetectUTF32(); if all of them fail, the application's default encoding is
// used.
//
// Returns false (leaving the detector state untouched) when `buffer` is empty,
// true otherwise.
bool EncodingDetector::DetectEncoding(const QByteArray& buffer)
{
    if (buffer.isEmpty())
        return false;

    // Known BOM signatures. Order matters: the 4-byte UTF-32LE mark
    // (FF FE 00 00) begins with the 2-byte UTF-16LE mark (FF FE), so the
    // UTF-32 entries must be tested before the UTF-16 ones.
    struct BomSignature
    {
        const char* bytes;
        int         length;
        const char* codecName;
    };
    static const BomSignature bomSignatures[] =
    {
        { "\xEF\xBB\xBF",     3, "UTF-8"    },
        { "\x00\x00\xFE\xFF", 4, "UTF-32BE" },
        { "\xFF\xFE\x00\x00", 4, "UTF-32LE" },
        { "\xFE\xFF",         2, "UTF-16BE" },
        { "\xFF\xFE",         2, "UTF-16LE" }
    };
    const size_t signatureCount = sizeof(bomSignatures) / sizeof(bomSignatures[0]);

    // Each signature is compared against the real buffer length, so short
    // buffers (e.g. a file that consists of nothing but a 2- or 3-byte BOM)
    // are recognized as well, and nothing past the end of the data is read.
    for (size_t i = 0; i < signatureCount; ++i)
    {
        const BomSignature& signature = bomSignatures[i];
        if (buffer.size() < signature.length)
            continue;
        if (memcmp(buffer.constData(), signature.bytes, static_cast<size_t>(signature.length)) != 0)
            continue;

        m_UseBOM = true;
        m_BOMSizeInBytes = signature.length;
        m_Encoding = QTextCodec::codecForName(signature.codecName);
        return true;
    }

    // No BOM in this buffer. The decision is based on the scan above rather
    // than on the previous value of m_UseBOM, so a stale flag from an earlier
    // call cannot suppress the heuristics.
    if (DetectUTF8(buffer))
    {
        m_Encoding = QTextCodec::codecForName("UTF-8");
    }
    else if (!DetectUTF16(buffer) && !DetectUTF32(buffer))
    {
        // Use default encoding
        // NOTE(review): on success DetectUTF16()/DetectUTF32() presumably set
        // m_Encoding themselves - confirm against their implementation.
        m_Encoding = ApplicationManager::settings()->defaultEncoding();
    }

    m_UseBOM = false;
    m_BOMSizeInBytes = 0;

    return true;
} // end of DetectEncoding
示例#2
0
bool EncodingDetector::DetectEncoding(const wxByte* buffer, size_t size, bool ConvertToWxString)
{
    ConfigManager* cfgMgr = Manager::Get()->GetConfigManager(_T("editor"));
    wxString encname = cfgMgr->Read(_T("/default_encoding"));

    if (cfgMgr->ReadInt(_T("/default_encoding/use_option"), 0) == 1)
    {
        // Bypass C::B's auto-detection
        m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false);

        if(m_UseLog)
        {
            wxString msg;
            msg.Printf(_T("Warning: bypassing E::B's auto-detection!\n"
                          "Encoding requested is: %s (ID: %d)"),
                       wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                       m_Encoding);
            Manager::Get()->GetLogManager()->DebugLog(msg);
        }
    }
    else
    {
        if (!buffer)
            return false;
        if (size >= 4)
        {
            // BOM is max 4 bytes
            char buff[4] = {'\0'};
            memcpy(buff, buffer, 4);

            if (memcmp(buff, "\xEF\xBB\xBF", 3) == 0)
            {
                m_UseBOM = true;
                m_BOMSizeInBytes = 3;
                m_Encoding = wxFONTENCODING_UTF8;
            }
            else if (memcmp(buff, "\x00\x00\xFE\xFF", 4) == 0)
            {
                m_UseBOM = true;
                m_BOMSizeInBytes = 4;
                m_Encoding = wxFONTENCODING_UTF32BE;
            }
            else if (memcmp(buff, "\x00\x00\xFF\xFE", 4) == 0)
            {
            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
            // X-ISO-10646-UCS-4-2143 can not (yet) be handled by wxWidgets
                m_Encoding = (wxFontEncoding)-1;
            }
            else if (memcmp(buff, "\xFF\xFE\x00\x00", 4) == 0)
            {
                m_UseBOM = true;
                m_BOMSizeInBytes = 4;
                m_Encoding = wxFONTENCODING_UTF32LE;
            }
            else if (memcmp(buff, "\xFE\xFF\x00\x00", 4) == 0)
            {
            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
            // X-ISO-10646-UCS-4-3412 can not (yet) be handled by wxWidgets
                m_Encoding = (wxFontEncoding)-1;
            }
            else if (memcmp(buff, "\xFE\xFF", 2) == 0)
            {
                m_UseBOM = true;
                m_BOMSizeInBytes = 2;
                m_Encoding = wxFONTENCODING_UTF16BE;
            }
            else if (memcmp(buff, "\xFF\xFE", 2) == 0)
            {
                m_UseBOM = true;
                m_BOMSizeInBytes = 2;
                m_Encoding = wxFONTENCODING_UTF16LE;
            }
        }


        if (m_UseBOM)
        {
            if(m_UseLog)
            {
                wxString msg;
                msg.Printf(_T("Detected encoding via BOM: %s (ID: %d)"),
                           wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                           m_Encoding);
                Manager::Get()->GetLogManager()->DebugLog(msg);
            }
        }
        // Try our own detection for UTF-16 and UTF-32, the mozilla-version does not work without BOM
        else if ( !DetectUTF16((wxByte*)buffer, size) &&
                  !DetectUTF32((wxByte*)buffer, size) )
            {
            // if we still have no results try mozilla's detection
            m_Encoding = wxFontMapper::Get()->CharsetToEncoding(DoIt((char*)buffer, size), false);
            if(m_Encoding == wxFONTENCODING_DEFAULT)
            {
                wxString enc_name = Manager::Get()->GetConfigManager(_T("editor"))->Read(_T("/default_encoding"), wxLocale::GetSystemEncodingName());
                m_Encoding = wxFontMapper::GetEncodingFromName(enc_name);
                if(m_UseLog)
                {
                    wxString msg;
                    msg.Printf(_T("Text seems to be pure ASCII!\n"
                                  "We use user specified encoding: %s (ID: %d)"),
                               wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                               m_Encoding);
                    Manager::Get()->GetLogManager()->DebugLog(msg);
                }
            }
            if (m_Encoding < 0)
            {
                // Use user-specified one; as a fallback
                m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false);

                if(m_UseLog)
                {
                    wxString msg;
                    msg.Printf(_T("Warning: Using user specified encoding as fallback!\n"
                                  "Encoding fallback is: %s (ID: %d)"),
                               wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                               m_Encoding);
                    Manager::Get()->GetLogManager()->DebugLog(msg);
                }
            }

            m_UseBOM = false;
            m_BOMSizeInBytes = 0;
        }
    }

    if(m_UseLog)
    {
        wxString msg;
        msg.Printf(_T("Final encoding detected: %s (ID: %d)"),
                   wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                   m_Encoding);
        Manager::Get()->GetLogManager()->DebugLog(msg);
    }

    if (ConvertToWxString)
    {
        ConvertToWxStr(buffer, size);
    }

    return true;
} // end of DetectEncoding