/*
 * The list of detectable encodings supported by this library
 *
 * Asks ICU's charset detector for every charset name it can report and
 * copies the names into a new Python tuple.
 *
 * self: unused.
 *
 * Returns: a new tuple of strings, or NULL with a Python exception set when
 *          ICU reports an error or an allocation fails.
 */
PyObject *
charlockholmes_get_supported_encodings(PyObject *self)
{
    UCharsetDetector *csd = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UEnumeration *encoding_list = NULL;
    PyObject *result = NULL;
    PyObject *name;
    int32_t enc_count;
    int32_t i;
    const char *enc_name;
    int32_t enc_name_len;
    int ok = 0;

    csd = ucsdet_open(&status);
    if (U_FAILURE(status))
        goto done;

    encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
    if (U_FAILURE(status))
        goto done;

    enc_count = uenum_count(encoding_list, &status);
    if (U_FAILURE(status))
        goto done;

    result = PyTuple_New(enc_count);
    if (!result)
        goto done;

    for (i = 0; i < enc_count; i++) {
        enc_name = uenum_next(encoding_list, &enc_name_len, &status);
        /* A NULL name here means the enumeration ended before the count it
         * reported; treat that like an ICU failure instead of building a
         * string from a NULL pointer. */
        if (U_FAILURE(status) || enc_name == NULL)
            goto done;

        name = PyString_FromStringAndSize(enc_name, enc_name_len);
        if (!name)
            goto done;

        /* PyTuple_SetItem takes over the reference to `name`. */
        PyTuple_SetItem(result, i, name);
    }
    ok = 1;

done:
    /* Both close functions accept NULL, so one cleanup path serves every
     * exit; the enumeration must be closed before its detector. */
    uenum_close(encoding_list);
    ucsdet_close(csd);

    if (!ok) {
        Py_XDECREF(result);
        result = NULL;
        /* Keep an exception already raised by the Python API (e.g. out of
         * memory); otherwise report the ICU failure. */
        if (!PyErr_Occurred())
            PyErr_SetString(PyExc_RuntimeError,
                            "ICU failed to enumerate detectable charsets");
    }
    return result;
}
/*
 * call-seq:
 *   detectable_charsets
 *
 * Get array of names of all detectable charsets that are known to the charset detection service.
 *
 * The ICU status is passed to ensure() (defined elsewhere in this file;
 * presumably it raises a Ruby exception on failure -- confirm) after the
 * enumeration has been opened and again after it has been fully consumed.
 */
static VALUE
UCharsetDetector_get_detectable_charsets(VALUE self)
{
    UCharsetDetector *detector;
    Data_Get_Struct(self, UCharsetDetector, detector);

    UErrorCode status = U_ZERO_ERROR;
    UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
    ensure(status);

    VALUE ary = rb_ary_new();
    int32_t result_length;
    const char *charset_name;

    while ((charset_name = uenum_next(charsets, &result_length, &status)) != NULL) {
        if (U_FAILURE(status))
            break;
        rb_ary_push(ary, rb_str_new2(charset_name));
    }

    /* Close the enumeration before checking the status, so it is released
     * even when ensure() does not return. */
    uenum_close(charsets);

    /* uenum_next() also returns NULL on failure, which ends the loop above;
     * checking here keeps such an error from being silently dropped. */
    ensure(status);

    return ary;
}
/*
 * Returns the sorted, duplicate-free list of charsets that libicu may detect
 * and for which QTextCodec also reports a codec.
 *
 * The list is built on the first call and stored in the private object;
 * later calls return the stored list as long as it is not empty.
 */
QStringList QCharsetDetector::getAllDetectableCharsets()
{
    Q_D(QCharsetDetector);

    // Already computed on an earlier call: hand back the stored list.
    if (!d->_allDetectableCharsets.isEmpty())
        return d->_allDetectableCharsets;

    // Codecs supported by QTextCodec (Qt 4.7), for reference:
    //
    // ISO-2022-JP JIS7 EUC-KR GB2312 Big5 Big5-ETen CP950 GB18030
    // EUC-JP Shift_JIS SJIS MS_Kanji System UTF-8 ISO-8859-1 latin1
    // CP819 IBM819 iso-ir-100 csISOLatin1 ISO-8859-15 latin9 UTF-32LE
    // UTF-32BE UTF-32 UTF-16LE UTF-16BE UTF-16 mulelao-1 roman8
    // hp-roman8 csHPRoman8 TIS-620 ISO 8859-11 WINSAMI2 WS2 Apple
    // Roman macintosh MacRoman windows-1258 CP1258 windows-1257
    // CP1257 windows-1256 CP1256 windows-1255 CP1255 windows-1254
    // CP1254 windows-1253 CP1253 windows-1252 CP1252 windows-1251
    // CP1251 windows-1250 CP1250 IBM866 CP866 csIBM866 IBM874 CP874
    // IBM850 CP850 csPC850Multilingual ISO-8859-16 iso-ir-226 latin10
    // ISO-8859-14 iso-ir-199 latin8 iso-celtic ISO-8859-13
    // ISO-8859-10 iso-ir-157 latin6 ISO-8859-10:1992 csISOLatin6
    // ISO-8859-9 iso-ir-148 latin5 csISOLatin5 ISO-8859-8 ISO
    // 8859-8-I iso-ir-138 hebrew csISOLatinHebrew ISO-8859-7 ECMA-118
    // greek iso-ir-126 csISOLatinGreek ISO-8859-6 ISO-8859-6-I
    // ECMA-114 ASMO-708 arabic iso-ir-127 csISOLatinArabic ISO-8859-5
    // cyrillic iso-ir-144 csISOLatinCyrillic ISO-8859-4 latin4
    // iso-ir-110 csISOLatin4 ISO-8859-3 latin3 iso-ir-109 csISOLatin3
    // ISO-8859-2 latin2 iso-ir-101 csISOLatin2 KOI8-U KOI8-RU KOI8-R
    // csKOI8R Iscii-Mlm Iscii-Knd Iscii-Tlg Iscii-Tml Iscii-Ori
    // Iscii-Gjr Iscii-Pnj Iscii-Bng Iscii-Dev TSCII GBK gb2312.1980-0
    // gbk-0 CP936 MS936 windows-936 jisx0201*-0 jisx0208*-0
    // ksc5601.1987-0 cp949 Big5-HKSCS big5-0 big5hkscs-0
    QStringList qtCodecNames;
    foreach (const QByteArray &codecName, QTextCodec::availableCodecs())
        qtCodecNames << QString::fromLatin1(codecName);

    // Charsets detectable by libicu 4.4.2. The order is irrelevant for the
    // result because the final list is sorted.
    static const char *const knownIcuCharsets[] = {
        "UTF-8",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32BE",
        "UTF-32LE",
        "ISO-8859-1",
        "ISO-8859-2",
        "ISO-8859-5",
        "ISO-8859-6",
        "ISO-8859-7",
        "ISO-8859-8-I",
        "ISO-8859-8",
        "ISO-8859-9",
        "KOI8-R",
        "Shift_JIS",
        "GB18030",
        "EUC-JP",
        "EUC-KR",
        "Big5",
        "ISO-2022-JP",
        "ISO-2022-KR",
        "ISO-2022-CN",
        "IBM424_rtl",
        "IBM424_ltr",
        "IBM420_rtl",
        "IBM420_ltr",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1255",
        "windows-1256",
        "windows-1254"
    };
    const int knownIcuCharsetCount =
        int(sizeof(knownIcuCharsets) / sizeof(knownIcuCharsets[0]));

    QStringList icuCharsets;
    for (int i = 0; i < knownIcuCharsetCount; ++i)
        icuCharsets << QLatin1String(knownIcuCharsets[i]);

    // Why the hardcoded table above exists:
    //
    // The charsets detectable by libicu can be queried with
    // ucsdet_getAllDetectableCharsets(). Its documentation at
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html says:
    //
    //   "The state of the Charset detector that is passed in does
    //   not affect the result of this function, but requiring a
    //   valid, open charset detector as a parameter insures that
    //   the charset detection service has been safely initialized
    //   and that the required detection data is available."
    //
    // That does not seem to be completely true; the result *does*
    // depend on the state of the detector:
    //
    //  - "windows-1250" *is* among the returned charsets if some
    //    non-ASCII text is in the detector, a detection has been
    //    attempted, and *then* ucsdet_getAllDetectableCharsets() is
    //    called.
    //  - "windows-1250" is *not* among the returned charsets when an
    //    empty detector is created and
    //    ucsdet_getAllDetectableCharsets() is called right away.
    //  - Once ucsdet_getAllDetectableCharsets() has been called, the
    //    returned list never seems to change anymore, even if the
    //    text in the detector is replaced and another detection is
    //    attempted that would have produced a different list had the
    //    function been called first in that state.
    //  - A charset is sometimes reported several times, once per
    //    language detectable with it; e.g. ISO-8859-2 may appear four
    //    times because of the languages "cs", "hu", "pl" and "ro".
    //
    // This looks like a bug, so to get a reliable list the table
    // above hardcodes every charset the function can possibly return
    // for any detector state. The query below should therefore not
    // add any extra charsets anymore, at least not for libicu 4.4.2.
    clearError();
    UEnumeration *charsetEnum =
        ucsdet_getAllDetectableCharsets(d->_uCharsetDetector, &(d->_status));
    if (!hasError()) {
        qint32 nameLength;
        for (;;) {
            const UChar *name =
                uenum_unext(charsetEnum, &nameLength, &(d->_status));
            if (name == NULL)
                break;
            // Skip names delivered while an error is flagged.
            if (hasError())
                continue;
            icuCharsets << QString::fromUtf16(name, nameLength);
        }
    }
    uenum_close(charsetEnum);

    // Keep only charsets that QTextCodec supports, dropping duplicates.
    foreach (const QString &candidate, icuCharsets) {
        if (!qtCodecNames.contains(candidate))
            continue;
        if (d->_allDetectableCharsets.contains(candidate))
            continue;
        d->_allDetectableCharsets << candidate;
    }

    std::sort(d->_allDetectableCharsets.begin(),
              d->_allDetectableCharsets.end());
    return d->_allDetectableCharsets;
}