/*
 * The list of detectable encodings supported by this library
 *
 * Opens a temporary ICU charset detector, enumerates every charset name it
 * reports and copies the names into a new Python tuple.
 *
 * Returns: a new tuple of strings, or NULL with a Python exception set if
 *          ICU reports an error or a Python allocation fails.
 */
PyObject *
charlockholmes_get_supported_encodings(PyObject *self)
{
	UCharsetDetector *csd = NULL;
	UEnumeration *encoding_list = NULL;
	UErrorCode status = U_ZERO_ERROR;
	PyObject *result = NULL;
	PyObject *item = NULL;
	const char *errmsg = NULL;
	const char *enc_name = NULL;
	int32_t enc_name_len = 0;
	int32_t enc_count = 0;
	int32_t i;

	csd = ucsdet_open(&status);
	if (U_FAILURE(status))
		goto icu_error;

	encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
	if (U_FAILURE(status))
		goto icu_error;

	enc_count = uenum_count(encoding_list, &status);
	if (U_FAILURE(status))
		goto icu_error;

	result = PyTuple_New(enc_count);
	if (!result)
		goto cleanup; /* Python exception already set */

	for (i = 0; i < enc_count; i++) {
		enc_name = uenum_next(encoding_list, &enc_name_len, &status);
		if (U_FAILURE(status))
			goto icu_error;
		if (!enc_name) {
			/* Enumeration ended before uenum_count() said it would. */
			errmsg = "ICU charset enumeration ended unexpectedly";
			goto icu_error;
		}

		item = PyString_FromStringAndSize(enc_name, enc_name_len);
		if (!item) {
			/* Python exception already set; drop the partial tuple. */
			Py_DECREF(result);
			result = NULL;
			goto cleanup;
		}
		/* PyTuple_SetItem steals the reference to item. */
		PyTuple_SetItem(result, i, item);
	}
	goto cleanup;

icu_error:
	Py_XDECREF(result);
	result = NULL;
	PyErr_SetString(PyExc_RuntimeError,
	                errmsg ? errmsg : u_errorName(status));

cleanup:
	/* Release the ICU handles on every path, success or failure. */
	if (encoding_list)
		uenum_close(encoding_list);
	if (csd)
		ucsdet_close(csd);

	return result;
}
Example #2
0
/*
 * call-seq:
 *   detectable_charsets
 *
 * Get array of names of all detectable charsets that are known to the charset detection service.
 *
 * The ICU enumeration is closed before the final status check, so it is
 * released even when that check reports an error.
 */
static VALUE
UCharsetDetector_get_detectable_charsets(VALUE self)
{
    UCharsetDetector *detector;
    Data_Get_Struct(self, UCharsetDetector, detector);
    UErrorCode status = U_ZERO_ERROR;
    
    UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
    /* NOTE(review): ensure() is defined elsewhere in this file; presumably it
     * raises a Ruby exception when status is a failure code -- confirm. */
    ensure(status);
    
    VALUE ary = rb_ary_new();
    int32_t result_length;
    const char *charset_name;
    
    /* uenum_next() returns NULL both at the end of the enumeration and on
     * error, so stop on either condition and inspect status afterwards. */
    while ((charset_name = uenum_next(charsets, &result_length, &status)) != NULL
           && U_SUCCESS(status)) {
        rb_ary_push(ary, rb_str_new2(charset_name));
    }
    uenum_close(charsets);

    /* Report an iteration error only after the enumeration is released. */
    ensure(status);
    
    return ary;
}
QStringList QCharsetDetector::getAllDetectableCharsets()
{
    Q_D(QCharsetDetector);

    // A non-empty list stored in the private object is returned as-is;
    // the list is only (re)built while that stored list is empty.
    if (!d->_allDetectableCharsets.isEmpty())
        return d->_allDetectableCharsets;

    // For reference, the codecs QTextCodec offered in Qt 4.7:
    //
    // ISO-2022-JP JIS7 EUC-KR GB2312 Big5 Big5-ETen CP950 GB18030
    // EUC-JP Shift_JIS SJIS MS_Kanji System UTF-8 ISO-8859-1 latin1
    // CP819 IBM819 iso-ir-100 csISOLatin1 ISO-8859-15 latin9 UTF-32LE
    // UTF-32BE UTF-32 UTF-16LE UTF-16BE UTF-16 mulelao-1 roman8
    // hp-roman8 csHPRoman8 TIS-620 ISO 8859-11 WINSAMI2 WS2 Apple
    // Roman macintosh MacRoman windows-1258 CP1258 windows-1257
    // CP1257 windows-1256 CP1256 windows-1255 CP1255 windows-1254
    // CP1254 windows-1253 CP1253 windows-1252 CP1252 windows-1251
    // CP1251 windows-1250 CP1250 IBM866 CP866 csIBM866 IBM874 CP874
    // IBM850 CP850 csPC850Multilingual ISO-8859-16 iso-ir-226 latin10
    // ISO-8859-14 iso-ir-199 latin8 iso-celtic ISO-8859-13
    // ISO-8859-10 iso-ir-157 latin6 ISO-8859-10:1992 csISOLatin6
    // ISO-8859-9 iso-ir-148 latin5 csISOLatin5 ISO-8859-8 ISO
    // 8859-8-I iso-ir-138 hebrew csISOLatinHebrew ISO-8859-7 ECMA-118
    // greek iso-ir-126 csISOLatinGreek ISO-8859-6 ISO-8859-6-I
    // ECMA-114 ASMO-708 arabic iso-ir-127 csISOLatinArabic ISO-8859-5
    // cyrillic iso-ir-144 csISOLatinCyrillic ISO-8859-4 latin4
    // iso-ir-110 csISOLatin4 ISO-8859-3 latin3 iso-ir-109 csISOLatin3
    // ISO-8859-2 latin2 iso-ir-101 csISOLatin2 KOI8-U KOI8-RU KOI8-R
    // csKOI8R Iscii-Mlm Iscii-Knd Iscii-Tlg Iscii-Tml Iscii-Ori
    // Iscii-Gjr Iscii-Pnj Iscii-Bng Iscii-Dev TSCII GBK gb2312.1980-0
    // gbk-0 CP936 MS936 windows-936 jisx0201*-0 jisx0208*-0
    // ksc5601.1987-0 cp949 Big5-HKSCS big5-0 big5hkscs-0

    // Step 1: collect the codec names Qt can actually decode.
    QStringList qtCodecNames;
    const QList<QByteArray> rawCodecNames = QTextCodec::availableCodecs();
    for (QList<QByteArray>::const_iterator it = rawCodecNames.constBegin();
         it != rawCodecNames.constEnd(); ++it) {
        qtCodecNames << QString::fromLatin1(*it);
    }

    // Step 2: seed the candidate list with every charset libicu 4.4.2 is
    // able to detect.
    //
    // In principle ucsdet_getAllDetectableCharsets() should be enough to
    // obtain this list; its documentation at
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html states:
    //
    //     "The state of the Charset detector that is passed in does
    //     not affect the result of this function, but requiring a
    //     valid, open charset detector as a parameter insures that
    //     the charset detection service has been safely initialized
    //     and that the required detection data is available."
    //
    // In practice the result *does* depend on the detector state.
    // "windows-1250", for instance, is reported when the detector holds
    // some non-ASCII text and a detection has been attempted *before*
    // ucsdet_getAllDetectableCharsets() is called, but it is missing
    // when the function is called on a freshly created, empty detector.
    // Once the function has been called, the list it returns no longer
    // seems to change, even if the text is replaced and another
    // detection is attempted that would have produced a different list
    // had the function been called first in that state.
    //
    // The function may also report the same charset several times, once
    // per language it can detect with it (e.g. ISO-8859-2 four times for
    // "cs", "hu", "pl" and "ro").
    //
    // That looks like a bug, so to get a reliable list the complete set
    // of charsets the function can possibly return, for any detector
    // state, is hard-coded here.
    static const char *const icuCharsetNames[] = {
        "UTF-8",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32BE",
        "UTF-32LE",
        "ISO-8859-1",
        "ISO-8859-2",
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8-I",
        "ISO-8859-8",
        "ISO-8859-9",
        "KOI8-R",
        "Shift_JIS",
        "GB18030",
        "EUC-JP",
        "EUC-KR",
        "Big5",
        "ISO-2022-JP",
        "ISO-2022-KR",
        "ISO-2022-CN",
        "IBM424_rtl",
        "IBM424_ltr",
        "IBM420_rtl",
        "IBM420_ltr",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1255",
        "windows-1256",
        "windows-1254"
    };
    const int icuCharsetCount =
        static_cast<int>(sizeof(icuCharsetNames) / sizeof(icuCharsetNames[0]));

    QStringList candidates;
    for (int i = 0; i < icuCharsetCount; ++i)
        candidates << QLatin1String(icuCharsetNames[i]);

    // Step 3: still ask ICU itself. With the hard-coded seed above this
    // should not add any extra charsets anymore, at least not for
    // libicu 4.4.2.
    clearError();
    UEnumeration *icuEnum =
        ucsdet_getAllDetectableCharsets(d->_uCharsetDetector, &(d->_status));
    if (!hasError()) {
        qint32 nameLength;
        for (const UChar *name = uenum_unext(icuEnum, &nameLength, &(d->_status));
             name != NULL;
             name = uenum_unext(icuEnum, &nameLength, &(d->_status))) {
            // Skip names delivered while an error is flagged.
            if (hasError())
                continue;
            candidates << QString::fromUtf16(name, nameLength);
        }
    }
    uenum_close(icuEnum);

    // Step 4: keep only what QTextCodec supports, dropping duplicates.
    for (int i = 0; i < candidates.size(); ++i) {
        const QString &charset = candidates.at(i);
        if (!qtCodecNames.contains(charset))
            continue;
        if (d->_allDetectableCharsets.contains(charset))
            continue;
        d->_allDetectableCharsets << charset;
    }

    std::sort(d->_allDetectableCharsets.begin(), d->_allDetectableCharsets.end());

    return d->_allDetectableCharsets;
}