// Orders matches for sorting: higher confidence wins; on a tie, a match
// that carries a language tag is preferred over one without.
bool MCharsetMatch::operator>(const MCharsetMatch &other) const
{
    const qint32 mine = confidence();
    const qint32 theirs = other.confidence();
    if (mine != theirs)
        return mine > theirs;
    // Equal confidence: prefer the match that has a detected language.
    return !language().isEmpty() && other.language().isEmpty();
}
/* Guess the character encoding of @buffer (of @size bytes) using the
 * MeeGo Touch charset detector, seeded with the current locale language.
 * Returns a newly-allocated encoding name (caller must g_free() it) when
 * the best match has confidence above 30, or NULL on error / low
 * confidence. */
gchar *
tracker_encoding_guess_meegotouch (const gchar *buffer,
                                   gsize        size)
{
	MCharsetDetector detector ((const char *) buffer, (int) size);
	gchar *language;
	gchar *guessed = NULL;

	if (detector.hasError ()) {
		g_warning ("Charset detector error when creating: %s",
		           detector.errorString ().toUtf8 ().data ());
		return NULL;
	}

	/* Bias detection towards the device's configured language. */
	language = tracker_locale_get (TRACKER_LOCALE_LANGUAGE);
	detector.setDeclaredLocale (language);

	MCharsetMatch bestMatch = detector.detect ();
	if (detector.hasError ()) {
		g_warning ("Charset detector error when detecting: %s",
		           detector.errorString ().toUtf8 ().data ());
		g_free (language);
		return NULL;
	}

	if (bestMatch.confidence () > 30) {
		guessed = g_strdup (bestMatch.name ().toUtf8 ().data ());
#if 0
		/* Debug aid: dump every candidate charset with its confidence. */
		QList<MCharsetMatch> mCharsetMatchList = detector.detectAll();
		if (detector.hasError ()) {
			g_warning ("Charset detector error when detecting all: %s",
			           detector.errorString ().toUtf8 ().data ());
		}

		g_debug ("Detecting all charsets...");
		for (gint i = 0; i < mCharsetMatchList.size (); ++i) {
			g_debug (" Charset '%s' with %d%% confidence...",
			         mCharsetMatchList[i].name ().toUtf8 ().data (),
			         mCharsetMatchList[i].confidence ());
		}
#endif
		g_debug ("Guessing charset as '%s' with %d%% confidence",
		         guessed, bestMatch.confidence ());
	} else {
		/* Below the 30% threshold the guess is too unreliable to use. */
		g_debug ("Ignoring charset as '%s' with %d%% (< 30%%) confidence",
		         bestMatch.name ().toUtf8 ().data (),
		         bestMatch.confidence ());
	}

	g_free (language);

	return guessed;
}
// Decodes the detector's input bytes to a QString using the charset
// named by @charsetMatch. Sets _status to U_ILLEGAL_ARGUMENT_ERROR (and
// returns an empty string) when no QTextCodec exists for that name, or
// to U_INVALID_CHAR_FOUND when the input contains bytes the codec
// cannot convert.
QString MCharsetDetector::text(const MCharsetMatch &charsetMatch)
{
    Q_D(MCharsetDetector);
    clearError();
    QTextCodec *codec
        = QTextCodec::codecForName(charsetMatch.name().toLatin1());
    if (!codec) {
        // There is no codec matching the name; return an empty string
        // to indicate that no conversion is possible.
        d->_status = U_ILLEGAL_ARGUMENT_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                   << "no codec for the name"
                   << charsetMatch.name()
                   << errorString();
        return QString();
    }
    QTextCodec::ConverterState converterState;
    QString decoded
        = codec->toUnicode(d->_ba.constData(), d->_ba.size(), &converterState);
    if (converterState.invalidChars > 0)
        d->_status = U_INVALID_CHAR_FOUND;
    return decoded;
}
// Returns all charset matches for the input, best first. Starts from
// ICU's ucsdet_detectAll() result, then fine-tunes it: adds plausible
// single-byte encodings for the declared encoding/locale, boosts
// confidences based on the declared encoding/locale, drops charsets
// QTextCodec cannot handle or that cannot decode the complete input,
// and re-sorts. On failure (or no usable match) sets d->_status and
// returns an empty list.
//
// Fix: the Central-European branch (cs/hu/pl/ro) previously added
// ISO-8859-1, duplicating the Western-European branch; those languages
// need ISO-8859-2 (Latin-2).
QList<MCharsetMatch> MCharsetDetector::detectAll()
{
    Q_D(MCharsetDetector);
    clearError();
    // get list of matches from ICU:
    qint32 matchesFound;
    const UCharsetMatch **uCharsetMatch
        = ucsdet_detectAll(d->_uCharsetDetector, &matchesFound, &(d->_status));
    if(hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
        return QList<MCharsetMatch>();
    }
    // sometimes the number of matches found by ucsdet_detectAll()
    // maybe 0 (matchesFound == 0) but d->_status has no error. Do not
    // return here with an error if this happens because the fine
    // tuning below may add more matches. Better check whether no
    // matches were found at all *after* the fine tuning.

    // fill list of matches into a QList<MCharsetMatch>:
    QList<MCharsetMatch> mCharsetMatchList;
    for (qint32 i = 0; i < matchesFound; ++i) {
        MCharsetMatch mCharsetMatch;
        mCharsetMatch.setName(
            QString::fromLatin1(ucsdet_getName(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatch.setConfidence(
            static_cast<qint32>(ucsdet_getConfidence(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatch.setLanguage(
            QString::fromLatin1(ucsdet_getLanguage(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatchList << mCharsetMatch;
    }
    if(d->_allDetectableCharsets.isEmpty())
        getAllDetectableCharsets();
    // libicu sometimes does not detect single byte encodings at all
    // even if they can encode the input without error. This seems to
    // contradict the documentation on
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html which says:
    //
    //     A confidence value of ten does have a general meaning - it is
    //     used for charsets that can represent the input data, but for
    //     which there is no other indication that suggests that the
    //     charset is the correct one. Pure 7 bit ASCII data, for example,
    //     is compatible with a great many charsets, most of which will
    //     appear as possible matches with a confidence of 10.
    //
    // But if such a single byte encoding has been set as the declared
    // encoding, it should at least be tried, therefore add it here to
    // the list of matches with the confidence value of 10. If it
    // cannot encode the complete input, the iteration over the list
    // of matches will detect that and remove it again.
    if(!d->_declaredEncoding.isEmpty()
       && (d->_declaredEncoding.startsWith(QLatin1String("ISO-8859-"))
           || d->_declaredEncoding.startsWith(QLatin1String("windows-12"))
           || d->_declaredEncoding.startsWith(QLatin1String("KOI8"))))
        mCharsetMatchList << MCharsetMatch(d->_declaredEncoding, "", 10);
    // Similar as for declaredEncoding, when declaredLocale is used
    // and it is a locale where the legacy encoding is a single byte
    // encoding, it should at least be tried, therefore add the legacy
    // single byte encoding for the declared locale here. If it
    // cannot encode the complete input, it will be removed again
    // later. Multibyte encodings like Shift_JIS, EUC-JP, Big5,
    // etc. ... do not need to be added, contrary to the single byte
    // encodings I could find no case where the matches returned by
    // libicu did omit a multibyte encoding when it should have been
    // included.
    if(!d->_declaredLocale.isEmpty()) {
        QString language = d->_declaredLocale.left(2);
        if(language == QLatin1String("ru")) {
            mCharsetMatchList << MCharsetMatch("KOI8-R", language, 10);
            mCharsetMatchList << MCharsetMatch("windows-1251", language, 10);
            mCharsetMatchList << MCharsetMatch("ISO-8859-5", language, 10);
        }
        else if(language == QLatin1String("uk")) {
            mCharsetMatchList << MCharsetMatch("KOI8-U", language, 10);
            mCharsetMatchList << MCharsetMatch("windows-1251", language, 10);
            // ISO 8859-5 encoding is missing the letter ґ needed for
            // Ukrainian, i.e. ISO 8859-5 should not occur for Ukrainian
        }
        else if(language == QLatin1String("tr"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-9", language, 10);
        else if(language == QLatin1String("el"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-7", language, 10);
        else if(language == QLatin1String("en")
                || language == QLatin1String("da")
                || language == QLatin1String("de")
                || language == QLatin1String("es")
                || language == QLatin1String("fi")
                || language == QLatin1String("fr")
                || language == QLatin1String("it")
                || language == QLatin1String("nl")
                || language == QLatin1String("no")
                || language == QLatin1String("nn")
                || language == QLatin1String("nb")
                || language == QLatin1String("pt")
                || language == QLatin1String("sv"))
            // Western-European languages: Latin-1.
            mCharsetMatchList << MCharsetMatch("ISO-8859-1", language, 10);
        else if(language == QLatin1String("cs")
                || language == QLatin1String("hu")
                || language == QLatin1String("pl")
                || language == QLatin1String("ro"))
            // Central-European languages need Latin-2 (ISO-8859-2);
            // ISO-8859-1 cannot encode letters like ř, ő, ł, or ş.
            mCharsetMatchList << MCharsetMatch("ISO-8859-2", language, 10);
        else if(language == QLatin1String("ar")
                || language == QLatin1String("fa")
                || language == QLatin1String("ur"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-6", language, 10);
        else if(language == QLatin1String("he"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-8", language, 10);
    }
    // iterate over the detected matches and do some fine tuning:
    bool sortNeeded = false;
    qint32 koi8rConfidence = 0;
    qint32 koi8uConfidence = 0;
    qint32 iso88595Confidence = 0;
    qint32 windows1251Confidence = 0;
    QList<MCharsetMatch>::iterator it = mCharsetMatchList.begin();
    while(it != mCharsetMatchList.end()) {
        // Accumulate per-charset confidence (ICU match + the 10 added
        // above, if any) for the Cyrillic fine tuning below.
        if((*it).name() == QLatin1String("KOI8-R"))
            koi8rConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("KOI8-U"))
            koi8uConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-8859-5"))
            iso88595Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("windows-1251"))
            windows1251Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-2022-JP")) {
            // non-Japanese text in ISO-2022-JP encoding is possible
            // but very unlikely:
            (*it).setLanguage("ja");
        }
        if((*it).name() == QLatin1String("UTF-8")
           && (*it).confidence() >= 80
           && (*it).confidence() < 99) {
            // Actually libicu currently only returns confidence
            // values of 100, 80, 25, and 10 for UTF-8. A value of 80
            // can mean two things:
            //
            // 1) (hasBOM && numValid > numInvalid*10)
            // 2) (numValid > 0 && numInvalid == 0)
            //
            // If it is case 1), the match will be removed anyway by
            // the check below which tests whether the complete input
            // can be encoded. I.e. we don’t need to care about this.
            //
            // If it is case 2) *and* the check below whether the
            // complete input can be encoded does not remove it, we
            // have valid UTF-8 and it is very unlikely that it is
            // anything else, therefore I think the confidence of 80
            // is too low and should be increased.
            // With a confidence of only 80, a longer ASCII text with
            // less than 4 UTF-8 characters will detect as ISO-8859-1
            // which is most certainly wrong.
            (*it).setConfidence(99);
            sortNeeded = true;
        }
        if(!d->_declaredEncoding.isEmpty()
           && (*it).name() == d->_declaredEncoding
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the user has set this to be the
            // declared encoding, it should be preferred over the
            // other encodings which also got confidence 10 (there are
            // often many with confidence 10). Do not increase the
            // confidence too much though in order not to override
            // real evidence that the input does really use something
            // different than the declared encoding.
            (*it).setConfidence(40);
            sortNeeded = true;
        }
        if(!d->_declaredLocale.isEmpty()
           && d->_declaredLocale.startsWith((*it).language())
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the detected language for this
            // charset matches the language declared by the user, this
            // charset should be preferred over the others which also
            // got confidence 10 (there are often many with confidence
            // 10). Do not increase the confidence too much though in
            // order not to override real evidence that the input does
            // really use something different than the declared
            // encoding. Use a slightly lower value than for the
            // declared encoding. Setting the declared encoding
            // is more precise and should have somewhat higher priority
            if(d->_declaredLocale.startsWith("ru")) {
                // Treat the Russian setDeclaredLocale("ru") case a
                // bit different than the single byte encodings for
                // other languages: Only increase the weight of
                // Russian encodings if setDeclaredLocale("ru") has
                // been used if libicu has really detected the same
                // Russian encoding as well. libicu usually detects
                // these Russian encodings with very low confidences <
                // 10 for short input. But if we are already pretty
                // sure that it is Russian because of
                // setDeclaredLocale("ru"), then these low confidences
                // detected by libicu seem to be useful to distinguish
                // between the different Russian legacy encodings.
                //
                // If the setDeclareLocale("ru") has been used, the
                // accumulated confidence for the Russian single byte
                // encoding is 10 (because of setDeclaredLocale("ru"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Russian legacy
                // encodings. Therefore, don’t increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-R and
                // ISO-8859-5 but 21 to the confidence for
                // windows-1251 to prefer windows-1251 a little bit
                // over ISO-8859-5.
                if((*it).name() == QLatin1String("KOI8-R")
                   && koi8rConfidence > 10 && koi8rConfidence < 30)
                    (*it).setConfidence(20 + koi8rConfidence);
                else if((*it).name() == QLatin1String("ISO-8859-5")
                        && iso88595Confidence > 10 && iso88595Confidence < 30)
                    (*it).setConfidence(20 + iso88595Confidence);
                else if((*it).name() == QLatin1String("windows-1251")
                        && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(21 + windows1251Confidence);
            }
            else if(d->_declaredLocale.startsWith("uk")) {
                // Treat the Ukrainian setDeclaredLocale("uk") case a
                // bit different than the single byte encodings for
                // Russian.
                //
                // If the setDeclareLocale("uk") has been used, the
                // accumulated confidence for the Ukrainian single byte
                // encoding is 10 (because of setDeclaredLocale("uk"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Ukrainian legacy
                // encodings. Therefore, don’t increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-U but 25 to the
                // confidence for windows-1251 to prefer windows-1251
                // over KOI8-U.
                if((*it).name() == QLatin1String("KOI8-U")
                   && koi8uConfidence > 10 && koi8uConfidence < 30)
                    (*it).setConfidence(20 + koi8uConfidence);
                else if((*it).name() == QLatin1String("windows-1251")
                        && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(25 + windows1251Confidence);
            }
            else if((d->_declaredLocale.contains("TW")
                     || d->_declaredLocale.contains("HK")
                     || d->_declaredLocale.contains("MO"))
                    && (*it).name() == QLatin1String("Big5")) {
                // Traditional Chinese, Big5 more likely
                (*it).setConfidence(39);
            }
            else if((d->_declaredLocale.contains("CN")
                     || d->_declaredLocale.contains("SG")
                     || d->_declaredLocale == "zh")
                    && (*it).name() == QLatin1String("GB18030")) {
                // Simplified Chinese, GB18030/GB2312 more likely.
                // Simplified Chinese is also assumed if only “zh”
                // is set. If the variant is unknown, simplified
                // Chinese seems a bit more likely. On top of that,
                // the settings application sets only “zh” for
                // simplified Chinese and the translations for
                // simplified Chinese are also in files like
                // “foo_zh.qm” which makes simplified Chinese more
                // likely when only “zh” is set on the device (see
                // also NB#242154).
                (*it).setConfidence(39);
            }
            else {
                (*it).setConfidence(38);
            }
            sortNeeded = true;
        }
        if(!d->_allDetectableCharsets.contains((*it).name())) {
            // remove matches for charsets not supported by QTextCodec;
            // it is probably some weird charset we cannot use anyway
            it = mCharsetMatchList.erase(it);
        }
        else {
            // test whether the complete input text can be encoded
            // using this match, if not remove the match
            clearError();
            text(*it);
            if(hasError()) {
                // qDebug() << __PRETTY_FUNCTION__
                //          << "removing match" << (*it).name()
                //          << "because it cannot encode the complete input"
                //          << errorString();
                it = mCharsetMatchList.erase(it);
                clearError();
            }
            else
                ++it;
        }
    }
    // sort the list of matches again if confidences have been changed:
    if(sortNeeded)
        qSort(mCharsetMatchList.begin(), mCharsetMatchList.end(),
              qGreater<MCharsetMatch>());
    if(mCharsetMatchList.isEmpty()) {
        // is there any better status to describe this case?
        d->_status = U_CE_NOT_FOUND_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                   << "number of matches found=0"
                   << errorString();
        return QList<MCharsetMatch>();
    }
    return mCharsetMatchList;
}