/**
 * Finish detection and report the name of the resulting charset.
 *
 * DataEnd() is always invoked first. If the detector is done afterwards
 * (mDone set), mDetectedCharset is returned. Otherwise the answer is derived
 * from mInputState alone:
 *   - eEscAscii  -> "ibm850"
 *   - ePureAscii -> "ASCII"
 *   - any other  -> NULL (nothing could be determined)
 *
 * @return a charset name, or NULL when no charset was determined.
 */
const char *Detector::Close(void)
{
    DataEnd();

    if (mDone)
        return mDetectedCharset;

    // No definite result: fall back on what kind of input was seen.
    switch (mInputState)
    {
        case eEscAscii:
            return "ibm850";

        case ePureAscii:
            return "ASCII";

        default:
            return NULL;
    }
}
void nsCyrillicDetector::HandleData(const char* aBuf, PRUint32 aLen) { PRUint8 cls; const char* b; PRUint32 i; if(mDone) return; for(i=0, b=aBuf;i<aLen;i++,b++) { for(PRUintn j=0;j<mItems;j++) { if( 0x80 & *b) cls = mCyrillicClass[j][(*b) & 0x7F]; else cls = 0; NS_ASSERTION( cls <= 32 , "illegal character class"); mProb[j] += gCyrillicProb[mLastCls[j]][cls]; mLastCls[j] = cls; } } // We now only based on the first block we receive DataEnd(); }
bool EncodingDetector::DetectEncoding(const wxByte* buffer, size_t size, bool convert_to_wxstring) { ConfigManager* cfgMgr = Manager::Get()->GetConfigManager(_T("editor")); wxString encname = cfgMgr->Read(_T("/default_encoding")); if (cfgMgr->ReadInt(_T("/default_encoding/use_option"), 0) == 1) { // Bypass C::B's auto-detection m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false); if (m_UseLog) { wxString msg; msg.Printf(_T("Warning: bypassing C::B's auto-detection!\n" "Encoding requested is: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } else { if (!buffer) return false; // Try our own detection for UTF-16 and UTF-32, the Mozilla-version does not work without BOM if ( DetectEncodingEx(buffer, size) ) { if (m_UseBOM && m_UseLog) { wxString msg; msg.Printf(_T("Detected encoding via BOM: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } else { //{ MOZILLA nsUniversalDetector START // If we still have no results try Mozilla (taken from nsUdetXPCOMWrapper.cpp): Reset(); nsresult res = HandleData((char*)buffer, size); if (res==NS_OK) DataEnd(); else { m_MozillaResult = wxEmptyString; if (m_UseLog) Manager::Get()->GetLogManager()->DebugLog(F(_T("Mozilla universal detection failed with %d."), res)); } //} MOZILLA nsUniversalDetector END if ( !m_MozillaResult.IsEmpty() ) m_Encoding = wxFontMapper::Get()->CharsetToEncoding(m_MozillaResult, false); if (m_Encoding == wxFONTENCODING_DEFAULT) { wxString enc_name = Manager::Get()->GetConfigManager(_T("editor"))->Read(_T("/default_encoding"), wxLocale::GetSystemEncodingName()); m_Encoding = wxFontMapper::GetEncodingFromName(enc_name); if (m_UseLog) { wxString msg; msg.Printf(_T("Text seems to be pure ASCII!\n" "We use user specified encoding: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), 
m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } if (m_Encoding < 0) { // Use user-specified one; as a fallback m_Encoding = wxFontMapper::Get()->CharsetToEncoding(encname, false); if (m_UseLog) { wxString msg; msg.Printf(_T("Warning: Using user specified encoding as fallback!\n" "Encoding fallback is: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } } m_UseBOM = false; m_BOMSizeInBytes = 0; } } if (m_UseLog) { wxString msg; msg.Printf(_T("Final encoding detected: %s (ID: %d)"), wxFontMapper::Get()->GetEncodingDescription(m_Encoding).c_str(), m_Encoding); Manager::Get()->GetLogManager()->DebugLog(msg); } if (convert_to_wxstring && !ConvertToWxString(buffer, size) && m_UseLog) Manager::Get()->GetLogManager()->DebugLog(_T("Something seriously went wrong while converting file content to wxString!")); return true; }
/// @brief Detect the character set of a file and fill `list` with candidates.
/// @param file Path of the file to inspect (opened through io::Open).
///
/// The file is fed to the universal detector in 4096-byte chunks until the
/// detector is done or the stream is exhausted. Two shortcuts report the
/// single candidate "binary" and stop: a file larger than 100 MB, and a file
/// in which more than one eighth of the bytes read so far are control
/// characters other than CR, LF and TAB.
///
/// After DataEnd(), `list` holds either the detected charset with confidence
/// 1, or "US-ASCII" for pure-ASCII input, or every prober whose confidence
/// exceeds 0.01, sorted by descending confidence.
///
/// @throws UnknownCharset if the input state is neither high-byte nor pure
///         ASCII, or if no prober is confident enough.
UCDetect::UCDetect(const std::string &file)
: nsUniversalDetector(NS_FILTER_ALL)
{
	{
		agi::scoped_ptr<std::ifstream> stream(io::Open(file, true));

		// Anything above 100 MB is either binary or too large for us to do
		// anything useful with, so do not even try.
		stream->seekg(0, std::ios::end);
		if (stream->tellg() > 100 * 1024 * 1024) {
			list.emplace_back(1.f, "binary");
			return;
		}
		stream->seekg(0, std::ios::beg);

		std::streamsize suspicious = 0;
		std::streamsize total = 0;
		while (!mDone && *stream) {
			char chunk[4096];
			stream->read(chunk, sizeof(chunk));
			const std::streamsize got = stream->gcount();
			HandleData(chunk, static_cast<PRUint32>(got));

			if (mDone)
				break;

			// Crude binary-file heuristic: count control characters that
			// are not ordinary text whitespace.
			total += got;
			for (const char *p = chunk, *stop = chunk + got; p != stop; ++p) {
				const bool text_whitespace = (*p == '\r' || *p == '\n' || *p == '\t');
				if (static_cast<unsigned char>(*p) < 32 && !text_whitespace)
					++suspicious;
			}

			if (suspicious > total / 8) {
				list.emplace_back(1.f, "binary");
				return;
			}
		}
	}

	DataEnd();

	if (mDetectedCharset) {
		list.emplace_back(1.f, mDetectedCharset);
		return;
	}

	if (mInputState == ePureAscii) {
		list.emplace_back(1.f, "US-ASCII");
	}
	else if (mInputState == eHighbyte) {
		// No definite answer: collect every prober with a usable confidence.
		for (PRInt32 idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
			auto &prober = mCharSetProbers[idx];
			if (!prober)
				continue;

			const float confidence = prober->GetConfidence();
			if (confidence > 0.01f)
				list.emplace_back(confidence, prober->GetCharSetName());
		}

		if (list.empty())
			throw UnknownCharset("Unknown character set.");
	}
	else {
		throw UnknownCharset("Unknown character set.");
	}

	// Most confident candidate first.
	using Candidate = std::pair<float, std::string>;
	sort(begin(list), end(list), [](const Candidate &lhs, const Candidate &rhs) {
		return lhs.first > rhs.first;
	});
}