/// Ends detection and reports the resulting charset name.
///
/// DataEnd() is called first. If mDone is set afterwards, the stored
/// mDetectedCharset is returned. Otherwise the answer is derived from the
/// input state alone: "ibm850" for eEscAscii, "ASCII" for ePureAscii, and
/// NULL for any other state.
///
/// @return A charset name, or NULL when nothing could be determined.
const char *Detector::Close(void) {
	DataEnd();

	// A finished detector already holds its answer.
	if (mDone)
		return mDetectedCharset;

	// No result: fall back on what kind of bytes were seen.
	if (mInputState == eEscAscii)
		return "ibm850";

	if (mInputState == ePureAscii)
		return "ASCII";

	return NULL;
}
void nsCyrillicDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
   PRUint8 cls;
   const char* b;
   PRUint32 i;
   if(mDone) 
      return;
   for(i=0, b=aBuf;i<aLen;i++,b++)
   {
     for(PRUintn j=0;j<mItems;j++)
     {
        if( 0x80 & *b)
           cls = mCyrillicClass[j][(*b) & 0x7F];
        else 
           cls = 0;
        NS_ASSERTION( cls <= 32 , "illegal character class");
        mProb[j] += gCyrillicProb[mLastCls[j]][cls];
        mLastCls[j] = cls;
     } 
   }
   // We now only based on the first block we receive
   DataEnd();
}
bool EncodingDetector::DetectEncoding(const wxByte* buffer, size_t size, bool convert_to_wxstring)
{
    ConfigManager* cfgMgr = Manager::Get()->GetConfigManager(_T("editor"));
    wxString encname = cfgMgr->Read(_T("/default_encoding"));

    if (cfgMgr->ReadInt(_T("/default_encoding/use_option"), 0) == 1)
    {
        // Bypass C::B's auto-detection
        m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false);

        if (m_UseLog)
        {
            wxString msg;
            msg.Printf(_T("Warning: bypassing C::B's auto-detection!\n"
                          "Encoding requested is: %s (ID: %d)"),
                       wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                       m_Encoding);
            Manager::Get()->GetLogManager()->DebugLog(msg);
        }
    }
    else
    {
        if (!buffer)
            return false;

        // Try our own detection for UTF-16 and UTF-32, the Mozilla-version does not work without BOM
        if ( DetectEncodingEx(buffer, size) )
        {
            if (m_UseBOM && m_UseLog)
            {
                wxString msg;
                msg.Printf(_T("Detected encoding via BOM: %s (ID: %d)"),
                           wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                           m_Encoding);
                Manager::Get()->GetLogManager()->DebugLog(msg);
            }
        }
        else
        {
            //{ MOZILLA nsUniversalDetector START
            // If we still have no results try Mozilla (taken from nsUdetXPCOMWrapper.cpp):
            Reset(); nsresult res = HandleData((char*)buffer, size);
            if (res==NS_OK)
                DataEnd();
            else
            {
                m_MozillaResult = wxEmptyString;
                if (m_UseLog)
                    Manager::Get()->GetLogManager()->DebugLog(F(_T("Mozilla universal detection failed with %d."), res));
            }
            //} MOZILLA nsUniversalDetector END

            if ( !m_MozillaResult.IsEmpty() )
                m_Encoding = wxFontMapper::Get()->CharsetToEncoding(m_MozillaResult, false);

            if (m_Encoding == wxFONTENCODING_DEFAULT)
            {
                wxString enc_name = Manager::Get()->GetConfigManager(_T("editor"))->Read(_T("/default_encoding"), wxLocale::GetSystemEncodingName());
                m_Encoding = wxFontMapper::GetEncodingFromName(enc_name);
                if (m_UseLog)
                {
                    wxString msg;
                    msg.Printf(_T("Text seems to be pure ASCII!\n"
                                  "We use user specified encoding: %s (ID: %d)"),
                               wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                               m_Encoding);
                    Manager::Get()->GetLogManager()->DebugLog(msg);
                }
            }

            if (m_Encoding < 0)
            {
                // Use user-specified one; as a fallback
                m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false);
                if (m_UseLog)
                {
                    wxString msg;
                    msg.Printf(_T("Warning: Using user specified encoding as fallback!\n"
                                  "Encoding fallback is: %s (ID: %d)"),
                               wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                               m_Encoding);
                    Manager::Get()->GetLogManager()->DebugLog(msg);
                }
            }

            m_UseBOM = false;
            m_BOMSizeInBytes = 0;
        }
    }

    if (m_UseLog)
    {
        wxString msg;
        msg.Printf(_T("Final encoding detected: %s (ID: %d)"),
                   wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(),
                   m_Encoding);
        Manager::Get()->GetLogManager()->DebugLog(msg);
    }

    if (convert_to_wxstring && !ConvertToWxString(buffer, size) && m_UseLog)
        Manager::Get()->GetLogManager()->DebugLog(_T("Something seriously went wrong while converting file content to wxString!"));

    return true;
}
Example #4
0
/// Runs charset detection on a file and fills `list` with candidates.
///
/// The file is read in 4096-byte chunks, each passed to HandleData(), until
/// mDone is set or the stream stops being readable. A file larger than
/// 100 MB, or one where more than 1/8 of the bytes read so far are control
/// characters other than CR, LF and TAB, is recorded as "binary" with
/// confidence 1 and nothing further is done.
///
/// After DataEnd(), `list` holds either the single detected charset, or
/// "US-ASCII" for pure ASCII input, or every prober result with confidence
/// above 0.01, sorted by descending confidence.
///
/// @param file Path of the file to examine.
/// @throws UnknownCharset if no candidate could be produced.
UCDetect::UCDetect(const std::string &file)
: nsUniversalDetector(NS_FILTER_ALL)
{
	{
		agi::scoped_ptr<std::ifstream> stream(io::Open(file, true));

		// Anything over 100 MB is either binary or too large to do anything
		// useful with.
		const int maxSize = 100 * 1024 * 1024;
		stream->seekg(0, std::ios::end);
		if (stream->tellg() > maxSize) {
			list.emplace_back(1.f, "binary");
			return;
		}
		stream->seekg(0, std::ios::beg);

		std::streamsize suspicious = 0;
		std::streamsize total = 0;

		while (!mDone && *stream) {
			char chunk[4096];
			stream->read(chunk, sizeof(chunk));
			const std::streamsize got = stream->gcount();
			HandleData(chunk, static_cast<PRUint32>(got));

			// Once the detector has decided, the binary check is skipped.
			if (mDone)
				break;

			// Crude binary heuristic: count control characters that are not
			// ordinary text whitespace.
			total += got;
			for (const char *p = chunk; p != chunk + got; ++p) {
				const bool textWhitespace = (*p == '\r' || *p == '\n' || *p == '\t');
				if (static_cast<unsigned char>(*p) < 32 && !textWhitespace)
					++suspicious;
			}

			if (suspicious > total / 8) {
				list.emplace_back(1.f, "binary");
				return;
			}
		}
	}

	DataEnd();

	if (mDetectedCharset) {
		list.emplace_back(1.f, mDetectedCharset);
		return;
	}

	if (mInputState == eHighbyte) {
		// No definite answer: collect every prober that is at least
		// slightly confident.
		for (PRInt32 idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
			if (mCharSetProbers[idx]) {
				const float confidence = mCharSetProbers[idx]->GetConfidence();
				if (confidence > 0.01f)
					list.emplace_back(confidence, mCharSetProbers[idx]->GetCharSetName());
			}
		}

		if (list.empty())
			throw UnknownCharset("Unknown character set.");
	}
	else if (mInputState == ePureAscii) {
		list.emplace_back(1.f, "US-ASCII");
	}
	else {
		throw UnknownCharset("Unknown character set.");
	}

	// Highest confidence first.
	using entry_ref = std::pair<float, std::string> const&;
	sort(begin(list), end(list), [](entry_ref a, entry_ref b) { return a.first > b.first; });
}