Example #1
static Array HHVM_METHOD(EncodingDetector, detectAll) {
  FETCH_DET(data, this_);
  UErrorCode error = U_ZERO_ERROR;
  int32_t count = 0;
  auto matches = ucsdet_detectAll(data->detector(), &count, &error);
  if (U_FAILURE(error)) {
    data->throwException("Could not detect all encodings, error %d (%s)",
                         error, u_errorName(error));
  }
  Array ret = Array::Create();
  for (int i = 0; i < count; ++i) {
    ret.append(EncodingMatch::newInstance(matches[i]));
  }
  return ret;
}
Example #2
File: uchardet.c  Project: sraach/uchardet
/*
 * call-seq:
 *   detect_all(text=nil, declared_encoding=nil)
 *
 * Find all charset matches that appear to be consistent with the input,
 * returning an array of results.  The results are ordered with the
 * best quality match first.
 *
 * Because the detection only looks at a limited amount of the
 * input byte data, some of the returned charsets may fail to handle
 * all of the input data.
 *
 * Raises an error if
 * * no charset appears to match the data
 * * no input text has been provided (with +text+ or set with #text= )
 */
static VALUE
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
{
    VALUE text;
    VALUE declared_encoding;
    
    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
    set_text(self, text);
    set_declared_encoding(self, declared_encoding);
    
    UCharsetDetector *detector;
    Data_Get_Struct(self, UCharsetDetector, detector);
    UErrorCode status = U_ZERO_ERROR;
    int32_t matches_found;
    
    const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
    ensure(status);
    
    VALUE ary = rb_ary_new();
    int i = 0;
    
    for (i = 0; i < matches_found; i++) {
        const char *encoding_name = ucsdet_getName(matches[i], &status);
        ensure(status);

        int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
        ensure(status);
        
        const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
        ensure(status);
        
        VALUE hash = rb_hash_new();
        rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
        rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
        rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
        
        rb_ary_push(ary, hash);
    }
    
    return ary;
}
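The ordering guarantee described in the comment above (best match first) and the fact that only a sample of the input is inspected are properties of ucsdet_detectAll() itself, not of the Ruby wrapper. A minimal standalone C++ sketch against the plain ICU API; the sample string and program structure are illustrative assumptions, not taken from any project above:

#include <unicode/ucsdet.h>
#include <cstdio>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* det = ucsdet_open(&status);
    if (U_FAILURE(status))
        return 1;

    // Plain ASCII is compatible with many charsets, so expect several
    // matches, each with a low confidence value.
    const char text[] = "The quick brown fox jumps over the lazy dog";
    ucsdet_setText(det, text, (int32_t)(sizeof(text) - 1), &status);

    int32_t found = 0;
    const UCharsetMatch** matches = ucsdet_detectAll(det, &found, &status);
    if (U_SUCCESS(status)) {
        // ucsdet_detectAll() returns candidates sorted by descending confidence.
        for (int32_t i = 0; i < found; ++i) {
            const char* name = ucsdet_getName(matches[i], &status);
            int32_t conf = ucsdet_getConfidence(matches[i], &status);
            if (U_FAILURE(status))
                break;
            printf("%-16s confidence=%d\n", name, (int)conf);
        }
    }
    ucsdet_close(det);  // also releases the match array
    return 0;
}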
Example #3
Array c_EncodingDetector::t_detectall() {
  int32_t matchesFound;
  UErrorCode status = U_ZERO_ERROR;
  const UCharsetMatch** matches = ucsdet_detectAll(
    m_encoding_detector,
    &matchesFound,
    &status);
  if (U_FAILURE(status)) {
    throw Exception(
      "Could not detect all encodings, error %d (%s)", status, u_errorName(status));
  }

  Array ret = Array::Create();
  int32_t i;
  for (i = 0; i < matchesFound; i++) {
    p_EncodingMatch matchobj = NEWOBJ(c_EncodingMatch)();
    matchobj->m_encoding_match = matches[i];
    ret.append(matchobj);
  }
  return ret;
}
Example #4
mod_websocket_bool_t
mod_websocket_conv_isUTF8(const char *data, size_t siz) {
    mod_websocket_bool_t ret = MOD_WEBSOCKET_FALSE;
    UErrorCode err = U_ZERO_ERROR;
    UCharsetDetector *detector = NULL;
    const UCharsetMatch **match;
    int32_t f = 0, i;
    const char *name;

    if (!data || !siz) {
        return MOD_WEBSOCKET_TRUE;
    }
    if (siz > INT32_MAX) {
        return MOD_WEBSOCKET_FALSE;
    }
    detector = ucsdet_open(&err);
    if (U_FAILURE(err)) {
        return MOD_WEBSOCKET_FALSE;
    }
    ucsdet_setText(detector, data, siz, &err);
    if (U_FAILURE(err)) {
        goto go_out;
    }
    match = ucsdet_detectAll(detector, &f, &err);
    if (U_FAILURE(err)) {
        goto go_out;
    }
    for (i = 0; i < f; i++) {
        name = ucsdet_getName(match[i], &err);
        if (strcasecmp(MOD_WEBSOCKET_UTF8_STR, name) == 0) {
            ret = MOD_WEBSOCKET_TRUE;
            break;
        }
    }

 go_out:
    ucsdet_close(detector);
    detector = NULL;
    return ret;
}
Example #5
/*
 * Attempt to detect the encoding of this string, and return
 * a list with all the possible encodings that match it.
 *
 *
 * str      - a String, what you want to detect the encoding of
 * hint_enc - an optional String (like "UTF-8"), the encoding name which will
 *            be used as an additional hint to the charset detector
 *
 * Returns: a list with zero or more dicts,
 *          each of them with encoding, language, type and confidence
 *          parameters
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *content;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int i, match_count;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        lst = PyList_New(1);
        if (!lst)
            return NULL;

        content = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        PyList_SET_ITEM(lst, 0, content);
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (matches) {
        lst = PyList_New(match_count);
        if (!lst)
            return NULL;

        for (i = 0; i < match_count; ++i) {
            mname = ucsdet_getName(matches[i], &status);
            mlang = ucsdet_getLanguage(matches[i], &status);
            mconfidence = ucsdet_getConfidence(matches[i], &status);
            if (mlang && mlang[0])
                content = Py_BuildValue("{ss,ss,si,ss}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence,
                        "language", mlang);
            else
                content = Py_BuildValue("{ss,ss,si}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence);

            PyList_SET_ITEM(lst, i, content);
        }

        return lst;
    }

    Py_INCREF(Py_None);
    return Py_None;
}
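The optional hint argument above is forwarded to ucsdet_setDeclaredEncoding() before detection runs, which nudges the detector toward the declared charset without forcing the answer. A small self-contained sketch of that pattern; guess_charset is a hypothetical helper name, not part of charlockholmes:

#include <unicode/ucsdet.h>
#include <cstring>
#include <string>

// Hypothetical helper: best-guess charset name for `data`, optionally biased
// by a declared-encoding hint (e.g. taken from an HTTP header).
// Returns an empty string on failure.
static std::string guess_charset(const char* data, int32_t len, const char* hint) {
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* det = ucsdet_open(&status);
    if (U_FAILURE(status))
        return std::string();

    if (hint && *hint)
        ucsdet_setDeclaredEncoding(det, hint, (int32_t)strlen(hint), &status);
    ucsdet_setText(det, data, len, &status);

    std::string result;
    int32_t found = 0;
    const UCharsetMatch** matches = ucsdet_detectAll(det, &found, &status);
    if (U_SUCCESS(status) && matches && found > 0) {
        const char* name = ucsdet_getName(matches[0], &status);  // best match first
        if (U_SUCCESS(status) && name)
            result = name;  // copy before closing the detector
    }
    ucsdet_close(det);  // invalidates `matches` and the name pointer
    return result;
}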
Example #6
QList<QCharsetMatch> QCharsetDetector::detectAll()
{
    Q_D(QCharsetDetector);
    clearError();
    // get list of matches from ICU:
    qint32 matchesFound;
    const UCharsetMatch **uCharsetMatch
        = ucsdet_detectAll(d->_uCharsetDetector, &matchesFound, &(d->_status));
    if(hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
        return QList<QCharsetMatch>();
    }
    // sometimes the number of matches found by ucsdet_detectAll()
    // may be 0 (matchesFound == 0) while d->_status reports no error.
    // Do not return with an error if this happens, because the fine
    // tuning below may add more matches. Better to check whether no
    // matches were found at all *after* the fine tuning.

    // fill list of matches into a QList<QCharsetMatch>:
    QList<QCharsetMatch> qCharsetMatchList;
    for (qint32 i = 0; i < matchesFound; ++i) {
        QCharsetMatch qCharsetMatch;
        qCharsetMatch.setName(
            QString::fromLatin1(ucsdet_getName(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatch.setConfidence(
            static_cast<qint32>(ucsdet_getConfidence (uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatch.setLanguage(
            QString::fromLatin1(ucsdet_getLanguage(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatchList << qCharsetMatch;
    }
    if(d->_allDetectableCharsets.isEmpty())
        getAllDetectableCharsets();
    // libicu sometimes does not detect single byte encodings at all
    // even if they can encode the input without error. This seems to
    // contradict the documentation on
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html which says:
    //
    //     A confidence value of ten does have a general meaning - it is
    //     used for charsets that can represent the input data, but for
    //     which there is no other indication that suggests that the
    //     charset is the correct one. Pure 7 bit ASCII data, for example,
    //     is compatible with a great many charsets, most of which will
    //     appear as possible matches with a confidence of 10.
    //
    // But if such a single byte encoding has been set as the declared
    // encoding, it should at least be tried, therefore add it here to
    // the list of matches with the confidence value of 10. If it
    // cannot encode the complete input, the iteration over the list
    // of matches will detect that and remove it again.
    if(!d->_declaredEncoding.isEmpty()
        && (d->_declaredEncoding.startsWith(QLatin1String("ISO-8859-"))
            || d->_declaredEncoding.startsWith(QLatin1String("windows-12"))
            || d->_declaredEncoding.startsWith(QLatin1String("KOI8"))))
            qCharsetMatchList << QCharsetMatch(d->_declaredEncoding, QString(), 10);
    // As with declaredEncoding: when declaredLocale is set and it is a
    // locale whose legacy encoding is a single byte encoding, that
    // encoding should at least be tried, therefore add the legacy
    // single byte encoding for the declared locale here. If it cannot
    // encode the complete input, it will be removed again later.
    // Multibyte encodings like Shift_JIS, EUC-JP, Big5, etc. do not
    // need to be added; unlike for the single byte encodings, I could
    // find no case where the matches returned by libicu omitted a
    // multibyte encoding that should have been included.
    if(!d->_declaredLocale.isEmpty()) {
        QString language = d->_declaredLocale.left(2);
        if(language ==  QLatin1String("ru")) {
            qCharsetMatchList << QCharsetMatch(QLatin1String("KOI8-R"), language, 10);
            qCharsetMatchList << QCharsetMatch(QLatin1String("windows-1251"), language, 10);
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-5"), language, 10);
        }
        else if(language == QLatin1String("tr"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-9"), language, 10);
        else if(language == QLatin1String("el"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-7"), language, 10);
        else if(language == QLatin1String("en")
                || language == QLatin1String("da")
                || language == QLatin1String("de")
                || language == QLatin1String("es")
                || language == QLatin1String("fi")
                || language == QLatin1String("fr")
                || language == QLatin1String("it")
                || language == QLatin1String("nl")
                || language == QLatin1String("no")
                || language == QLatin1String("nn")
                || language == QLatin1String("nb")
                || language == QLatin1String("pt")
                || language == QLatin1String("sv"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-1"), language, 10);
        else if(language == QLatin1String("cs")
                || language == QLatin1String("hu")
                || language == QLatin1String("pl")
                || language == QLatin1String("ro"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-1"), language, 10);
        else if(language == QLatin1String("ar")
                || language == QLatin1String("fa")
                || language == QLatin1String("ur"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-6"), language, 10);
        else if(language == QLatin1String("he"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-8"), language, 10);
    }
    // iterate over the detected matches and do some fine tuning:
    bool sortNeeded = false;
    qint32 koi8rConfidence = 0;
    qint32 iso88595Confidence = 0;
    qint32 windows1251Confidence = 0;
    QList<QCharsetMatch>::iterator it = qCharsetMatchList.begin();
    while(it != qCharsetMatchList.end()) {
        if((*it).name() == QLatin1String("KOI8-R"))
            koi8rConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-8859-5"))
            iso88595Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("windows-1251"))
            windows1251Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-2022-JP")) {
            // non-Japanese text in ISO-2022-JP encoding is possible
            // but very unlikely:
            (*it).setLanguage(QLatin1String("ja"));
        }
        if((*it).name() == QLatin1String("UTF-8")
           && (*it).confidence() >= 80 && (*it).confidence() < 99) {
            // Actually libicu currently only returns confidence
            // values of 100, 80, 25, and 10 for UTF-8.  A value of 80
            // can mean two things:
            //
            // 1)  (hasBOM && numValid > numInvalid*10)
            // 2)  (numValid > 0 && numInvalid == 0)
            //
            // If it is case 1), the match will be removed anyway by
            // the check below which tests whether the complete input
            // can be encoded. I.e. we don’t need to care about this.
            //
            // If it is case 2) *and* the check below whether the
            // complete input can be encoded does not remove it, we
            // have valid UTF-8 and it is very unlikely that it is
            // anything else, therefore I think the confidence of 80
            // is too low and should be increased.
            // With a confidence of only 80, a longer ASCII text with
            // less than 4 UTF-8 characters will detect as ISO-8859-1
            // which is most certainly wrong.
            (*it).setConfidence(99);
            sortNeeded = true;
        }
        if(!d->_declaredEncoding.isEmpty()
           && (*it).name() == d->_declaredEncoding
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the user has set this to be the
            // declared encoding, it should be preferred over the
            // other encodings which also got confidence 10 (there are
            // often many with confidence 10). Do not increase the
            // confidence too much though in order not to override
            // real evidence that the input does really use something
            // different than the declared encoding.
            (*it).setConfidence(40);
            sortNeeded = true;
        }
        if(!d->_declaredLocale.isEmpty()
           && d->_declaredLocale.startsWith((*it).language())
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the detected language for this
            // charset matches the language declared by the user, this
            // charset should be preferred over the others which also
            // got confidence 10 (there are often many with confidence
            // 10). Do not increase the confidence too much though in
            // order not to override real evidence that the input does
            // really use something different than the declared
            // encoding.  Use a slightly lower value than for the
            // declared encoding. Setting the declared encoding
            // is more precise and should have somewhat higher priority
            if (d->_declaredLocale.startsWith(QLatin1String("ru"))) {
                // Treat the Russian setDeclaredLocale("ru") case a
                // bit different than the single byte encodings for
                // other languages: Only increase the weight of
                // Russian encodings if setDeclaredLocale("ru") has
                // been used if libicu has really detected the same
                // Russian encoding as well. libicu usually detects
                // these Russian encodings with very low confidences <
                // 10 for short input.  But if we are already pretty
                // sure that it is Russian because of
                // setDeclaredLocale("ru"), then these low confidences
                // detected by libicu seem to be useful to distinguish
                // between the different Russian legacy encodings.
                //
                // If setDeclaredLocale("ru") has been used, the
                // accumulated confidence for the Russian single byte
                // encoding is 10 (because of setDeclaredLocale("ru"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Russian legacy
                // encodings. Therefore, don’t increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-R and
                // ISO-8859-5 but 21 to the confidence for
                // windows-1251 to prefer windows-1251 a little bit
                // over ISO-8859-5.
                if((*it).name() == QLatin1String("KOI8-R")
                   && koi8rConfidence > 10 && koi8rConfidence < 30)
                    (*it).setConfidence(20 + koi8rConfidence);
                else if((*it).name() == QLatin1String("ISO-8859-5")
                   && iso88595Confidence > 10 && iso88595Confidence < 30)
                    (*it).setConfidence(20 + iso88595Confidence);
                else if((*it).name() == QLatin1String("windows-1251")
                   && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(21 + windows1251Confidence);
            }
            else if ((d->_declaredLocale.contains(QLatin1String("TW"))
                || d->_declaredLocale.contains(QLatin1String("HK"))
                || d->_declaredLocale.contains(QLatin1String("MO")))
               && (*it).name() == QLatin1String("Big5")) {
                 // Traditional Chinese, Big5 more likely
                (*it).setConfidence(39);
            }
            else if ((d->_declaredLocale.contains(QLatin1String("CN"))
                     || d->_declaredLocale.contains(QLatin1String("SG"))
                     || d->_declaredLocale == QLatin1String("zh"))
                    && (*it).name() == QLatin1String("GB18030")) {
                // Simplified Chinese, GB18030/GB2312 more likely.
                // Simplified Chinese is also assumed if only “zh”
                // is set. If the variant is unknown, simplified
                // Chinese seems a bit more likely. On top of that,
                // the settings application sets only “zh” for
                // simplified Chinese and the translations for
                // simplified Chinese are also in files like
                // “foo_zh.qm” which makes simplified Chinese more
                // likely when only “zh” is set on the device (see
                // also NB#242154).
                (*it).setConfidence(39);
            }
            else {
                (*it).setConfidence(38);
            }
            sortNeeded = true;
        }
        if(!d->_allDetectableCharsets.contains((*it).name())) {
            // remove matches for charsets not supported by QTextCodec;
            // those are probably weird charsets we cannot use anyway
            it = qCharsetMatchList.erase(it);
        }
        else {
            // test whether the complete input text can be encoded
            // using this match, if not remove the match
            clearError();
            text(*it);
            if(hasError()) {
                // qMailLog(Messaging) << __PRETTY_FUNCTION__
                //          << "removing match" << (*it).name()
                //          << "because it cannot encode the complete input"
                //          << errorString();
                it = qCharsetMatchList.erase(it);
                clearError();
            }
            else
                ++it;
        }
    }
    // sort the list of matches again if confidences have been changed:
    if(sortNeeded)
        std::sort(qCharsetMatchList.begin(), qCharsetMatchList.end(),
              std::greater<QCharsetMatch>());
    if(qCharsetMatchList.isEmpty()) {
        // is there any better status to describe this case?
        d->_status = U_CE_NOT_FOUND_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                 << "number of matches found=0"
                 << errorString();
        return QList<QCharsetMatch>();
    }
    return qCharsetMatchList;
}
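The cleanup loop above leans on a text(*it) helper (not shown here) that tries to decode the whole input with the candidate charset and flags an error if any byte cannot be mapped; that is how matches covering only part of the data get removed. A rough sketch of that kind of check with a plain ICU converter, assuming the raw bytes are available as data/len; the function name and structure are placeholders, not the class's actual implementation:

#include <unicode/ucnv.h>
#include <vector>

// Returns true if every byte of the input decodes cleanly in `charset`.
// The "stop" callback makes the conversion fail on the first unmappable byte.
static bool decodesCompletely(const char* data, int32_t len, const char* charset) {
    UErrorCode status = U_ZERO_ERROR;
    UConverter* conv = ucnv_open(charset, &status);
    if (U_FAILURE(status))
        return false;
    ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &status);

    // 2*len UTF-16 units is a generous upper bound for the decoded output.
    std::vector<UChar> out(2 * (size_t)len + 1);
    ucnv_toUChars(conv, out.data(), (int32_t)out.size(), data, len, &status);
    ucnv_close(conv);
    return U_SUCCESS(status);
}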
Example #7
std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
	std::vector<std::string> encodings;
#ifdef LCF_SUPPORT_ICU
	std::ostringstream text;

	// Populate Data::terms and Data::system; they remain empty (their defaults) if the load fails
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save <<
	Data::terms.menu_quit <<
	Data::terms.new_game <<
	Data::terms.load_game <<
	Data::terms.exit_game <<
	Data::terms.status <<
	Data::terms.row <<
	Data::terms.order <<
	Data::terms.wait_on <<
	Data::terms.wait_off <<
	Data::terms.level <<
	Data::terms.health_points <<
	Data::terms.spirit_points <<
	Data::terms.normal_status <<
	Data::terms.exp_short <<
	Data::terms.lvl_short <<
	Data::terms.hp_short <<
	Data::terms.sp_short <<
	Data::terms.sp_cost <<
	Data::terms.attack <<
	Data::terms.defense <<
	Data::terms.spirit <<
	Data::terms.agility <<
	Data::terms.weapon <<
	Data::terms.shield <<
	Data::terms.armor <<
	Data::terms.helmet <<
	Data::terms.accessory <<
	Data::terms.save_game_message <<
	Data::terms.load_game_message <<
	Data::terms.file <<
	Data::terms.exit_game_message <<
	Data::terms.yes <<
	Data::terms.no <<
	Data::system.boat_name <<
	Data::system.ship_name <<
	Data::system.airship_name <<
	Data::system.title_name <<
	Data::system.gameover_name <<
	Data::system.system_name <<
	Data::system.system2_name <<
	Data::system.battletest_background <<
	Data::system.frame_name;

	if (!text.str().empty()) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		std::string s = text.str();
		ucsdet_setText(detector, s.c_str(), s.length(), &status);

		int32_t matches_count;
		const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);

		if (matches != NULL) {
			// Collect all candidates, most confident comes first
			for (int i = 0; i < matches_count; ++i) {
				std::string encoding = ucsdet_getName(matches[i], &status);

				// Fixes to ensure proper Windows encodings
				if (encoding == "Shift_JIS") {
					encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
				} else if (encoding == "EUC-KR") {
					encodings.push_back("windows-949-2000"); // Korean with \ as backlash
				} else if (encoding == "GB18030") {
					encodings.push_back("windows-936-2000"); // Simplified Chinese
				} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
					encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
				} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
					encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
				} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
					encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
				} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
					encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
				} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
					encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
				} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
					encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
				} else {
					encodings.push_back(encoding);
				}
			}
		}
		ucsdet_close(detector);
	}
#endif

	return encodings;
}
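Every name pushed into the vector above, including the remapped ones such as "ibm-943_P15A-2003", is an ICU converter alias, so the caller can hand any entry of the returned list straight to the converter API. A sketch of that follow-up step, re-encoding the detected bytes as UTF-8 with one candidate; toUtf8 is a hypothetical helper, not part of ReaderUtil:

#include <unicode/ucnv.h>
#include <string>
#include <vector>

// Hypothetical follow-up: convert `data` from one of the detected encodings
// (an ICU converter alias, e.g. encodings.front()) to UTF-8.
static std::string toUtf8(const std::string& data, const std::string& encoding) {
    UErrorCode status = U_ZERO_ERROR;
    // Each input byte expands to at most a few UTF-8 bytes; 4x is a safe bound.
    std::vector<char> out(data.size() * 4 + 1);
    int32_t written = ucnv_convert("UTF-8", encoding.c_str(),
                                   out.data(), (int32_t)out.size(),
                                   data.c_str(), (int32_t)data.size(), &status);
    if (U_FAILURE(status))
        return std::string();
    return std::string(out.data(), (size_t)written);
}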
Example #8
bool detectTextEncoding(const char* data, size_t len,
                        const char* hintEncodingName,
                        TextEncoding* detectedEncoding)
{
    *detectedEncoding = TextEncoding();
    int matchesCount = 0; 
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    if (U_FAILURE(status))
        return false;
    ucsdet_enableInputFilter(detector, true);
    ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);  // avoid leaking the detector on early return
        return false;
    }

    // FIXME: A few things we can do other than improving
    // the ICU detector itself. 
    // 1. Use ucsdet_detectAll and pick the most likely one given
    // "the context" (parent-encoding, referrer encoding, etc).
    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the 
    // encoding with a highest confidence among the detector-specific
    // limited set of candidate encodings.
    // Below is a partial implementation of the first part of what's outlined
    // above.
    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);
        return false;
    }

    const char* encoding = 0;
    if (hintEncodingName) {
        TextEncoding hintEncoding(hintEncodingName);
        // 10 is the minimum confidence value consistent with the codepoint
        // allocation in a given encoding. The size of a chunk passed to
        // us varies even for the same html file (apparently depending on 
        // the network load). When we're given a rather short chunk, we 
        // don't have a sufficiently reliable signal other than the fact that
        // the chunk is consistent with a set of encodings. So, instead of
        // setting an arbitrary threshold, we have to scan all the encodings
        // consistent with the data.  
        const int32_t kThreshold = 10;
        for (int i = 0; i < matchesCount; ++i) {
            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (confidence < kThreshold)
                break;
            const char* matchEncoding = ucsdet_getName(matches[i], &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (TextEncoding(matchEncoding) == hintEncoding) {
                encoding = hintEncodingName;
                break;
            }
        }
    }
    // If no match is found so far, just pick the top match. 
    // This can happen, say, when a parent frame in EUC-JP refers to
    // a child frame in Shift_JIS and both frames do NOT specify the encoding
    // making us resort to auto-detection (when it IS turned on).
    if (!encoding && matchesCount > 0)
        encoding = ucsdet_getName(matches[0], &status);
    if (U_SUCCESS(status)) {
        *detectedEncoding = TextEncoding(encoding);
        ucsdet_close(detector);
        return true;
    }    
    ucsdet_close(detector);
    return false;
}
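The hint handling above compares each sufficiently confident match against hintEncoding and only falls back to the top match when none of them agrees. The same policy can be written against plain ICU types without WebCore's TextEncoding class; a simplified sketch with exact name comparison instead of alias-aware equality (pickEncoding is a hypothetical name):

#include <unicode/ucsdet.h>
#include <cstring>
#include <string>

// Hypothetical standalone helper mirroring the policy above: prefer the hint
// when it appears among the matches that are still consistent with the data
// (confidence >= 10), otherwise fall back to the top match.
static std::string pickEncoding(const char* data, size_t len, const char* hint) {
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* det = ucsdet_open(&status);
    if (U_FAILURE(status))
        return std::string();
    ucsdet_setText(det, data, static_cast<int32_t>(len), &status);

    std::string chosen;
    int32_t count = 0;
    const UCharsetMatch** matches = ucsdet_detectAll(det, &count, &status);
    if (U_SUCCESS(status) && count > 0) {
        if (hint) {
            for (int32_t i = 0; i < count; ++i) {
                int32_t confidence = ucsdet_getConfidence(matches[i], &status);
                if (U_FAILURE(status) || confidence < 10)
                    break;                      // remaining matches are weaker still
                const char* name = ucsdet_getName(matches[i], &status);
                if (U_SUCCESS(status) && name && strcmp(name, hint) == 0) {
                    chosen = name;
                    break;
                }
            }
        }
        if (chosen.empty()) {
            status = U_ZERO_ERROR;
            const char* top = ucsdet_getName(matches[0], &status);
            if (U_SUCCESS(status) && top)
                chosen = top;                   // no hint match: take the best guess
        }
    }
    ucsdet_close(det);
    return chosen;
}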