Code Example #1
bool MCharsetMatch::operator>(const MCharsetMatch &other) const
{
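    // A match ranks higher when its confidence is higher; on equal
    // confidence, a match that carries a detected language outranks
    // one whose language is empty.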
    if(this->confidence() > other.confidence())
        return true;
    else if (this->confidence() == other.confidence()
             && !this->language().isEmpty()
             && other.language().isEmpty())
        return true;
    else
        return false;
}
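
This operator is what allows lists of matches to be ranked. A minimal usage sketch (hypothetical, not taken from the project; it assumes an MCharsetDetector named detector has already been constructed over some input, as in Code Example #2):

#include <algorithm>   // std::sort
#include <functional>  // std::greater
#include <QDebug>

// Sort all candidate matches in descending order, so the most confident
// match (and, on ties, the one with a detected language) comes first.
QList<MCharsetMatch> matches = detector.detectAll();
std::sort(matches.begin(), matches.end(), std::greater<MCharsetMatch>());
if (!matches.isEmpty())
    qDebug() << "best guess:" << matches.first().name()
             << "confidence:" << matches.first().confidence();

The library itself does the same thing with the Qt 4 helpers qSort() and qGreater(), as can be seen at the end of Code Example #4.
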
Code Example #2
gchar *
tracker_encoding_guess_meegotouch (const gchar *buffer,
                                   gsize        size)
{
	/* Initialize detector */
	MCharsetDetector detector ((const char *)buffer, (int)size);
	gchar *locale;
	gchar *encoding = NULL;

	if (detector.hasError ()) {
		g_warning ("Charset detector error when creating: %s",
		           detector.errorString ().toUtf8 ().data ());
		return NULL;
	}

	locale = tracker_locale_get (TRACKER_LOCALE_LANGUAGE);
	detector.setDeclaredLocale (locale);

	MCharsetMatch bestMatch = detector.detect ();

	if (detector.hasError ()) {
		g_warning ("Charset detector error when detecting: %s",
		           detector.errorString ().toUtf8 ().data ());
		g_free (locale);
		return NULL;
	}

	if (bestMatch.confidence () > 30) {
		encoding = g_strdup (bestMatch.name ().toUtf8 ().data ());

#if 0
		QList<MCharsetMatch> mCharsetMatchList = detector.detectAll();

		if (detector.hasError ()) {
			g_warning ("Charset detector error when detecting all: %s",
			           detector.errorString ().toUtf8 ().data ());
		}

		g_debug ("Detecting all charsets...");
		for (gint i = 0; i < mCharsetMatchList.size (); ++i) {
			g_debug ("  Charset '%s' with %d%% confidence...",
			         mCharsetMatchList[i].name ().toUtf8 ().data (),
			         mCharsetMatchList[i].confidence ());
		}
#endif

		g_debug ("Guessing charset as '%s' with %d%% confidence",
		         encoding, bestMatch.confidence ());
	} else {
		g_debug ("Ignoring charset as '%s' with %d%% (< 30%%) confidence",
		         bestMatch.name ().toUtf8 ().data (),
		         bestMatch.confidence ());
	}

	g_free (locale);

	return encoding;
}
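
A hypothetical caller (not part of Tracker) could hand the guessed charset name straight to GLib's g_convert() to obtain UTF-8 text, assuming the name is one that iconv accepts:

/* Hypothetical helper: decode a raw buffer to UTF-8 using the guessed
 * charset; returns NULL if no charset was guessed or conversion failed. */
static gchar *
guess_and_convert_to_utf8 (const gchar *buffer,
                           gsize        size)
{
	gchar *encoding;
	gchar *utf8;
	GError *error = NULL;

	encoding = tracker_encoding_guess_meegotouch (buffer, size);
	if (encoding == NULL)
		return NULL;

	utf8 = g_convert (buffer, size, "UTF-8", encoding,
	                  NULL, NULL, &error);
	if (error != NULL) {
		g_warning ("Conversion from '%s' failed: %s",
		           encoding, error->message);
		g_error_free (error);
	}

	g_free (encoding);

	return utf8;
}
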
Code Example #3
File: mcharsetdetector.cpp  Project: amtep/libmlocale
QString MCharsetDetector::text(const MCharsetMatch &charsetMatch)
{
    Q_D(MCharsetDetector);
    clearError();
    QTextCodec *codec
        = QTextCodec::codecForName(charsetMatch.name().toLatin1());
    if (codec == NULL) { // there is no codec matching the name
        d->_status = U_ILLEGAL_ARGUMENT_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                 << "no codec for the name" << charsetMatch.name()
                 << errorString();
        // return empty string to indicate that no conversion is possible:
        return QString();
    }
    else {
        QTextCodec::ConverterState state;
        QString text =
            codec->toUnicode(d->_ba.constData(), d->_ba.size(), &state);
        if (state.invalidChars > 0)
            d->_status = U_INVALID_CHAR_FOUND;
        return text;
    }
}
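
A minimal usage sketch (hypothetical) combining detect() and text(); rawData is assumed to be a QByteArray holding the bytes to inspect:

// Construct a detector over the raw bytes (as in Code Example #2),
// take the best match and try to decode the input with it.
MCharsetDetector detector(rawData.constData(), rawData.size());
MCharsetMatch best = detector.detect();
QString decoded = detector.text(best);
if (detector.hasError())
    qWarning() << "input could not be decoded as" << best.name();
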
Code Example #4
File: mcharsetdetector.cpp  Project: amtep/libmlocale
QList<MCharsetMatch> MCharsetDetector::detectAll()
{
    Q_D(MCharsetDetector);
    clearError();
    // get list of matches from ICU:
    qint32 matchesFound;
    const UCharsetMatch **uCharsetMatch
        = ucsdet_detectAll(d->_uCharsetDetector, &matchesFound, &(d->_status));
    if(hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
        return QList<MCharsetMatch>();
    }
    // Sometimes the number of matches found by ucsdet_detectAll()
    // may be 0 (matchesFound == 0) even though d->_status has no
    // error.  Do not return with an error if this happens, because the
    // fine tuning below may add more matches.  Better to check whether
    // no matches were found at all *after* the fine tuning.

    // fill list of matches into a QList<MCharsetMatch>:
    QList<MCharsetMatch> mCharsetMatchList;
    for (qint32 i = 0; i < matchesFound; ++i) {
        MCharsetMatch mCharsetMatch;
        mCharsetMatch.setName(
            QString::fromLatin1(ucsdet_getName(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatch.setConfidence(
            static_cast<qint32>(ucsdet_getConfidence (uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatch.setLanguage(
            QString::fromLatin1(ucsdet_getLanguage(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<MCharsetMatch>();
        }
        mCharsetMatchList << mCharsetMatch;
    }
    if(d->_allDetectableCharsets.isEmpty())
        getAllDetectableCharsets();
    // libicu sometimes does not detect single byte encodings at all
    // even if they can encode the input without error. This seems to
    // contradict the documentation on
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html which says:
    //
    //     A confidence value of ten does have a general meaning - it is
    //     used for charsets that can represent the input data, but for
    //     which there is no other indication that suggests that the
    //     charset is the correct one. Pure 7 bit ASCII data, for example,
    //     is compatible with a great many charsets, most of which will
    //     appear as possible matches with a confidence of 10.
    //
    // But if such a single byte encoding has been set as the declared
    // encoding, it should at least be tried, therefore add it here to
    // the list of matches with the confidence value of 10. If it
    // cannot encode the complete input, the iteration over the list
    // of matches will detect that and remove it again.
    if(!d->_declaredEncoding.isEmpty()
        && (d->_declaredEncoding.startsWith(QLatin1String("ISO-8859-"))
            || d->_declaredEncoding.startsWith(QLatin1String("windows-12"))
            || d->_declaredEncoding.startsWith(QLatin1String("KOI8"))))
            mCharsetMatchList << MCharsetMatch(d->_declaredEncoding, "", 10);
    // Similarly to the declared encoding case: when declaredLocale is
    // set and it names a locale whose legacy encoding is a single byte
    // encoding, that encoding should at least be tried, therefore add
    // the legacy single byte encoding for the declared locale here.
    // If it cannot encode the complete input, it will be removed again
    // later.  Multibyte encodings like Shift_JIS, EUC-JP, Big5,
    // etc. do not need to be added: contrary to the single byte
    // encodings, I could find no case where the matches returned by
    // libicu omitted a multibyte encoding when it should have been
    // included.
    if(!d->_declaredLocale.isEmpty()) {
        QString language = d->_declaredLocale.left(2);
        if(language ==  QLatin1String("ru")) {
            mCharsetMatchList << MCharsetMatch("KOI8-R", language, 10);
            mCharsetMatchList << MCharsetMatch("windows-1251", language, 10);
            mCharsetMatchList << MCharsetMatch("ISO-8859-5", language, 10);
        }
        else if(language ==  QLatin1String("uk")) {
            mCharsetMatchList << MCharsetMatch("KOI8-U", language, 10);
            mCharsetMatchList << MCharsetMatch("windows-1251", language, 10);
            // ISO 8859-5 encoding is missing the letter ґ needed for
            // Ukrainian, i.e. ISO 8859-5 should not occur for Ukrainian
        }
        else if(language == QLatin1String("tr"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-9", language, 10);
        else if(language == QLatin1String("el"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-7", language, 10);
        else if(language == QLatin1String("en")
                || language == QLatin1String("da")
                || language == QLatin1String("de")
                || language == QLatin1String("es")
                || language == QLatin1String("fi")
                || language == QLatin1String("fr")
                || language == QLatin1String("it")
                || language == QLatin1String("nl")
                || language == QLatin1String("no")
                || language == QLatin1String("nn")
                || language == QLatin1String("nb")
                || language == QLatin1String("pt")
                || language == QLatin1String("sv"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-1", language, 10);
        else if(language == QLatin1String("cs")
                || language == QLatin1String("hu")
                || language == QLatin1String("pl")
                || language == QLatin1String("ro"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-1", language, 10);
        else if(language == QLatin1String("ar")
                || language == QLatin1String("fa")
                || language == QLatin1String("ur"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-6", language, 10);
        else if(language == QLatin1String("he"))
            mCharsetMatchList << MCharsetMatch("ISO-8859-8", language, 10);
    }
    // iterate over the detected matches and do some fine tuning:
    bool sortNeeded = false;
    qint32 koi8rConfidence = 0;
    qint32 koi8uConfidence = 0;
    qint32 iso88595Confidence = 0;
    qint32 windows1251Confidence = 0;
    QList<MCharsetMatch>::iterator it = mCharsetMatchList.begin();
    while(it != mCharsetMatchList.end()) {
        if((*it).name() == QLatin1String("KOI8-R"))
            koi8rConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("KOI8-U"))
            koi8uConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-8859-5"))
            iso88595Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("windows-1251"))
            windows1251Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-2022-JP")) {
            // non-Japanese text in ISO-2022-JP encoding is possible
            // but very unlikely:
            (*it).setLanguage("ja");
        }
        if((*it).name() == QLatin1String("UTF-8")
           && (*it).confidence() >= 80 && (*it).confidence() < 99) {
            // Actually libicu currently only returns confidence
            // values of 100, 80, 25, and 10 for UTF-8.  A value of 80
            // can mean two things:
            //
            // 1)  (hasBOM && numValid > numInvalid*10)
            // 2)  (numValid > 0 && numInvalid == 0)
            //
            // If it is case 1), the match will be removed anyway by
            // the check below which tests whether the complete input
            // can be encoded. I.e. we don’t need to care about this.
            //
            // If it is case 2) *and* the check below whether the
            // complete input can be encoded does not remove it, we
            // have valid UTF-8 and it is very unlikely that it is
            // anything else, therefore I think the confidence of 80
            // is too low and should be increased.
            // With a confidence of only 80, a longer ASCII text with
            // fewer than 4 UTF-8 characters would be detected as
            // ISO-8859-1, which is almost certainly wrong.
            (*it).setConfidence(99);
            sortNeeded = true;
        }
        if(!d->_declaredEncoding.isEmpty()
           && (*it).name() == d->_declaredEncoding
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the user has set this to be the
            // declared encoding, it should be preferred over the
            // other encodings which also got confidence 10 (there are
            // often many with confidence 10). Do not increase the
            // confidence too much though in order not to override
            // real evidence that the input does really use something
            // different than the declared encoding.
            (*it).setConfidence(40);
            sortNeeded = true;
        }
        if(!d->_declaredLocale.isEmpty()
           && d->_declaredLocale.startsWith((*it).language())
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the detected language for this
            // charset matches the language declared by the user, this
            // charset should be preferred over the others which also
            // got confidence 10 (there are often many with confidence
            // 10). Do not increase the confidence too much though in
            // order not to override real evidence that the input does
            // really use something different than the declared
            // encoding.  Use a slightly lower value than for the
            // declared encoding. Setting the declared encoding
            // is more precise and should have somewhat higher priority.
            if(d->_declaredLocale.startsWith("ru")) {
                // Treat the Russian setDeclaredLocale("ru") case a
                // bit different than the single byte encodings for
                // other languages: Only increase the weight of
                // Russian encodings if setDeclaredLocale("ru") has
                // been used if libicu has really detected the same
                // Russian encoding as well. libicu usually detects
                // these Russian encodings with very low confidences <
                // 10 for short input.  But if we are already pretty
                // sure that it is Russian because of
                // setDeclaredLocale("ru"), then these low confidences
                // detected by libicu seem to be useful to distinguish
                // between the different Russian legacy encodings.
                //
                // If the setDeclareLocale("ru") has been used, the
                // accumulated confidence for the Russian single byte
                // encoding is 10 (because of setDeclaredLocale("ru"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Russian legacy
                // encodings. Therefore, don’t increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-R and
                // ISO-8859-5 but 21 to the confidence for
                // windows-1251 to prefer windows-1251 a little bit
                // over ISO-8859-5.
                if((*it).name() == QLatin1String("KOI8-R")
                   && koi8rConfidence > 10 && koi8rConfidence < 30)
                    (*it).setConfidence(20 + koi8rConfidence);
                else if((*it).name() == QLatin1String("ISO-8859-5")
                   && iso88595Confidence > 10 && iso88595Confidence < 30)
                    (*it).setConfidence(20 + iso88595Confidence);
                else if((*it).name() == QLatin1String("windows-1251")
                   && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(21 + windows1251Confidence);
            }
            else if(d->_declaredLocale.startsWith("uk")) {
                // Treat the Ukrainian setDeclaredLocale("uk") case a
                // bit different than the single byte encodings for
                // Russian.
                //
                // If the setDeclareLocale("uk") has been used, the
                // accumulated confidence for the Ukrainian single byte
                // encoding is 10 (because of setDeclaredLocale("uk"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Ukrainian legacy
                // encodings. Therefore, don’t increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-U but 25 to the
                // confidence for windows-1251 to prefer windows-1251
                // over KOI8-U.
                if((*it).name() == QLatin1String("KOI8-U")
                   && koi8uConfidence > 10 && koi8uConfidence < 30)
                    (*it).setConfidence(20 + koi8uConfidence);
                else if((*it).name() == QLatin1String("windows-1251")
                   && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(25 + windows1251Confidence);
            }
            else if((d->_declaredLocale.contains("TW")
                || d->_declaredLocale.contains("HK")
                || d->_declaredLocale.contains("MO"))
               && (*it).name() == QLatin1String("Big5")) {
                 // Traditional Chinese, Big5 more likely
                (*it).setConfidence(39);
            }
            else if((d->_declaredLocale.contains("CN")
                     || d->_declaredLocale.contains("SG")
                     || d->_declaredLocale == "zh")
                    && (*it).name() == QLatin1String("GB18030")) {
                // Simplified Chinese, GB18030/GB2312 more likely.
                // Simplified Chinese is also assumed if only “zh”
                // is set. If the variant is unknown, simplified
                // Chinese seems a bit more likely. On top of that,
                // the settings application sets only “zh” for
                // simplified Chinese and the translations for
                // simplified Chinese are also in files like
                // “foo_zh.qm” which makes simplified Chinese more
                // likely when only “zh” is set on the device (see
                // also NB#242154).
                (*it).setConfidence(39);
            }
            else {
                (*it).setConfidence(38);
            }
            sortNeeded = true;
        }
        if(!d->_allDetectableCharsets.contains((*it).name())) {
            // Remove matches for charsets not supported by QTextCodec;
            // such a charset is probably something weird we cannot use anyway.
            it = mCharsetMatchList.erase(it);
        }
        else {
            // test whether the complete input text can be encoded
            // using this match, if not remove the match
            clearError();
            text(*it);
            if(hasError()) {
                // qDebug() << __PRETTY_FUNCTION__
                //          << "removing match" << (*it).name()
                //          << "because it cannot encode the complete input"
                //          << errorString();
                it = mCharsetMatchList.erase(it);
                clearError();
            }
            else
                ++it;
        }
    }
    // sort the list of matches again if confidences have been changed:
    if(sortNeeded)
        qSort(mCharsetMatchList.begin(), mCharsetMatchList.end(),
              qGreater<MCharsetMatch>());
    if(mCharsetMatchList.isEmpty()) {
        // is there any better status to describe this case?
        d->_status = U_CE_NOT_FOUND_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                 << "number of matches found=0"
                 << errorString();
        return QList<MCharsetMatch>();
    }
    return mCharsetMatchList;
}
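
A hypothetical consumer of detectAll() (not part of libmlocale), mirroring the 30% confidence cut-off used in Code Example #2:

// Run detectAll() and return the name of the most confident match whose
// confidence reaches the cut-off; returns an empty QString otherwise.
QString pickEncoding(MCharsetDetector &detector, qint32 minConfidence = 30)
{
    const QList<MCharsetMatch> matches = detector.detectAll();
    QString bestName;
    qint32 bestConfidence = minConfidence - 1;
    foreach (const MCharsetMatch &match, matches) {
        if (match.confidence() > bestConfidence) {
            bestConfidence = match.confidence();
            bestName = match.name();
        }
    }
    return bestName;
}
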