/**
 * Guesses the character encoding of @p text with ICU's charset detector.
 *
 * Returns the detected charset name, or an empty QByteArray when the build
 * has no ICU support (HAVE_ICU undefined), when ICU reports an error, or when
 * no match is produced. Any ICU failure is reported through qWarning().
 */
static QByteArray detectEncoding(const QByteArray& text)
{
    Q_UNUSED(text);
    QByteArray result;
#ifdef HAVE_ICU
    UErrorCode err = U_ZERO_ERROR;
    UCharsetDetector* csd = ucsdet_open(&err);
    const bool opened = csd && !U_FAILURE(err);

    if (opened)
        ucsdet_setText(csd, text.constData(), text.length(), &err);

    if (opened && !U_FAILURE(err)) {
        const UCharsetMatch* best = ucsdet_detect(csd, &err);
        if (best && !U_FAILURE(err))
            result = ucsdet_getName(best, &err);
    }

    if (U_FAILURE(err)) {
        qWarning("detectEncoding() failed: %s", u_errorName(err));
    }

    // The detector is closed on every path, including the failed-open one.
    ucsdet_close(csd);
#endif // HAVE_ICU
    return result;
}
/*
 * The list of detectable encodings supported by this library
 *
 * Returns: a new tuple of encoding-name strings, or NULL with a Python
 * exception set when ICU reports an error or an allocation fails.
 */
PyObject *
charlockholmes_get_supported_encodings(PyObject *self)
{
    UCharsetDetector *csd = NULL;
    UEnumeration *encoding_list = NULL;
    UErrorCode status = U_ZERO_ERROR;
    PyObject *result = NULL;
    PyObject *item;
    int32_t enc_count;
    int32_t i;
    const char *enc_name;
    int32_t enc_name_len;

    csd = ucsdet_open(&status);
    if (U_FAILURE(status))
        goto icu_error;

    encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
    if (U_FAILURE(status))
        goto icu_error;

    enc_count = uenum_count(encoding_list, &status);
    if (U_FAILURE(status))
        goto icu_error;

    result = PyTuple_New(enc_count);
    if (!result)
        goto done; /* PyTuple_New already set the exception */

    for (i = 0; i < enc_count; i++) {
        enc_name = uenum_next(encoding_list, &enc_name_len, &status);
        if (U_FAILURE(status))
            goto icu_error;
        if (enc_name == NULL) {
            /* Enumeration ended before the count it reported. */
            PyErr_SetString(PyExc_RuntimeError,
                            "ICU returned fewer charset names than it reported");
            goto fail;
        }

        item = PyString_FromStringAndSize(enc_name, enc_name_len);
        if (!item)
            goto fail;

        /* PyTuple_SetItem takes over the reference to item. */
        PyTuple_SetItem(result, i, item);
    }
    goto done;

icu_error:
    PyErr_SetString(PyExc_RuntimeError, u_errorName(status));
fail:
    Py_XDECREF(result);
    result = NULL;
done:
    /* Release both ICU handles on success and on every failure path. */
    uenum_close(encoding_list);
    ucsdet_close(csd);
    return result;
}
/**
 * tracker_encoding_guess_icu:
 * @buffer: bytes to inspect
 * @size: number of bytes in @buffer
 *
 * Asks ICU's charset detector for the most likely encoding of @buffer.
 *
 * Returns: a newly allocated charset name (from g_strdup()), or %NULL when
 * @size is not below %G_MAXINT32, when ICU reports an error, or when no
 * charset matched.
 */
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
                            gsize        size)
{
	UCharsetDetector *detector = NULL;
	const UCharsetMatch *match;
	gchar *charset = NULL;
	/* Must start cleared: the result of every ICU call below is judged
	 * through this variable, so an indeterminate value gives random failures. */
	UErrorCode status = U_ZERO_ERROR;

	/* ICU takes the length as int32_t; reject anything that would not fit. */
	if (size >= G_MAXINT32)
		goto failure;

	detector = ucsdet_open (&status);
	if (U_FAILURE (status))
		goto failure;

	ucsdet_setText (detector, buffer, (int32_t) size, &status);
	if (U_FAILURE (status))
		goto failure;

	match = ucsdet_detect (detector, &status);
	if (U_FAILURE (status) || match == NULL)
		goto failure;

	/* Copy the name before the detector is closed below. */
	charset = g_strdup (ucsdet_getName (match, &status));
	if (U_FAILURE (status)) {
		g_free (charset);
		charset = NULL;
	}

	if (charset)
		g_debug ("Guessing charset as '%s'", charset);

failure:
	if (detector)
		ucsdet_close (detector);

	return charset;
}
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence) { UCharsetDetector *csd; const UCharsetMatch *ucm; UErrorCode status = U_ZERO_ERROR; csd = ucsdet_open(&status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status)); return -1; } ucsdet_setText(csd, in, len, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status)); goto error; } ucm = ucsdet_detect(csd, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status)); goto error; } *confidence = ucsdet_getConfidence(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status)); goto error; } *charset = ucsdet_getName(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status)); goto error; } return 0; error: ucsdet_close(csd); return -1; }
mod_websocket_bool_t mod_websocket_conv_isUTF8(const char *data, size_t siz) { mod_websocket_bool_t ret = MOD_WEBSOCKET_FALSE; UErrorCode err = U_ZERO_ERROR; UCharsetDetector *detector = NULL; const UCharsetMatch **match; int32_t f = 0, i; const char *name; if (!data || !siz) { return MOD_WEBSOCKET_TRUE; } if (siz > INT32_MAX) { return MOD_WEBSOCKET_FALSE; } detector = ucsdet_open(&err); if (U_FAILURE(err)) { return MOD_WEBSOCKET_FALSE; } ucsdet_setText(detector, data, siz, &err); if (U_FAILURE(err)) { goto go_out; } match = ucsdet_detectAll(detector, &f, &err); if (U_FAILURE(err)) { goto go_out; } for (i = 0; i < f; i++) { name = ucsdet_getName(match[i], &err); if (strcasecmp(MOD_WEBSOCKET_UTF8_STR, name) == 0) { ret = MOD_WEBSOCKET_TRUE; break; } } go_out: ucsdet_close(detector); detector = NULL; return ret; }
/// Closes the ICU charset detector handle stored in _uCharsetDetector.
QCharsetDetectorPrivate::~QCharsetDetectorPrivate()
{
    UCharsetDetector *handle = _uCharsetDetector;
    ucsdet_close(handle);
}
/**
 * Detects candidate text encodings for a database file.
 *
 * Loads the database through LDB_Reader::Load(), concatenates the term and
 * system strings it populated, and asks ICU for every charset matching that
 * sample. Several detected names are replaced by the specific converter names
 * listed below.
 *
 * @param database_file path handed to LDB_Reader::Load().
 * @return candidate encoding names in the order ICU reported them; empty when
 *         the build lacks LCF_SUPPORT_ICU, when there is no text to inspect,
 *         or when ICU reports an error.
 */
std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
	std::vector<std::string> encodings;
#ifdef LCF_SUPPORT_ICU
	std::ostringstream text;

	// Populate Data::terms and Data::system or will empty by default even if load fails
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save <<
	Data::terms.menu_quit <<
	Data::terms.new_game <<
	Data::terms.load_game <<
	Data::terms.exit_game <<
	Data::terms.status <<
	Data::terms.row <<
	Data::terms.order <<
	Data::terms.wait_on <<
	Data::terms.wait_off <<
	Data::terms.level <<
	Data::terms.health_points <<
	Data::terms.spirit_points <<
	Data::terms.normal_status <<
	Data::terms.exp_short <<
	Data::terms.lvl_short <<
	Data::terms.hp_short <<
	Data::terms.sp_short <<
	Data::terms.sp_cost <<
	Data::terms.attack <<
	Data::terms.defense <<
	Data::terms.spirit <<
	Data::terms.agility <<
	Data::terms.weapon <<
	Data::terms.shield <<
	Data::terms.armor <<
	Data::terms.helmet <<
	Data::terms.accessory <<
	Data::terms.save_game_message <<
	Data::terms.load_game_message <<
	Data::terms.file <<
	Data::terms.exit_game_message <<
	Data::terms.yes <<
	Data::terms.no <<
	Data::system.boat_name <<
	Data::system.ship_name <<
	Data::system.airship_name <<
	Data::system.title_name <<
	Data::system.gameover_name <<
	Data::system.system_name <<
	Data::system.system2_name <<
	Data::system.battletest_background <<
	Data::system.frame_name;

	// Take one copy of the sample; str() returns a fresh string on every call.
	const std::string sample = text.str();

	if (!sample.empty()) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		const UCharsetMatch** matches = NULL;
		int32_t matches_count = 0;
		if (U_SUCCESS(status) && detector != NULL) {
			ucsdet_setText(detector, sample.c_str(), static_cast<int32_t>(sample.length()), &status);
			matches = ucsdet_detectAll(detector, &matches_count, &status);
		}

		if (U_SUCCESS(status) && matches != NULL) {
			// Collect all candidates, most confident comes first
			for (int32_t i = 0; i < matches_count; ++i) {
				const char* name = ucsdet_getName(matches[i], &status);
				// A NULL name must never reach the std::string constructor.
				if (U_FAILURE(status) || name == NULL) {
					break;
				}
				const std::string encoding = name;

				// Fixes to ensure proper Windows encodings
				if (encoding == "Shift_JIS") {
					encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
				} else if (encoding == "EUC-KR") {
					encodings.push_back("windows-949-2000"); // Korean with \ as backslash
				} else if (encoding == "GB18030") {
					encodings.push_back("windows-936-2000"); // Simplified Chinese
				} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
					encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
				} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
					encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
				} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
					encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
				} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
					encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
				} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
					encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
				} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
					encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
				} else {
					encodings.push_back(encoding);
				}
			}
		}
		ucsdet_close(detector);
	}
#endif

	return encodings;
}
bool detectTextEncoding(const char* data, size_t len, const char* hintEncodingName, TextEncoding* detectedEncoding) { *detectedEncoding = TextEncoding(); int matchesCount = 0; UErrorCode status = U_ZERO_ERROR; UCharsetDetector* detector = ucsdet_open(&status); if (U_FAILURE(status)) return false; ucsdet_enableInputFilter(detector, true); ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); if (U_FAILURE(status)) return false; // FIXME: A few things we can do other than improving // the ICU detector itself. // 1. Use ucsdet_detectAll and pick the most likely one given // "the context" (parent-encoding, referrer encoding, etc). // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. // Chinese, Japanese, Russian, Korean and Hebrew) by picking the // encoding with a highest confidence among the detector-specific // limited set of candidate encodings. // Below is a partial implementation of the first part of what's outlined // above. const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); if (U_FAILURE(status)) { ucsdet_close(detector); return false; } const char* encoding = 0; if (hintEncodingName) { TextEncoding hintEncoding(hintEncodingName); // 10 is the minimum confidence value consistent with the codepoint // allocation in a given encoding. The size of a chunk passed to // us varies even for the same html file (apparently depending on // the network load). When we're given a rather short chunk, we // don't have a sufficiently reliable signal other than the fact that // the chunk is consistent with a set of encodings. So, instead of // setting an arbitrary threshold, we have to scan all the encodings // consistent with the data. 
const int32_t kThresold = 10; for (int i = 0; i < matchesCount; ++i) { int32_t confidence = ucsdet_getConfidence(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (confidence < kThresold) break; const char* matchEncoding = ucsdet_getName(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (TextEncoding(matchEncoding) == hintEncoding) { encoding = hintEncodingName; break; } } } // If no match is found so far, just pick the top match. // This can happen, say, when a parent frame in EUC-JP refers to // a child frame in Shift_JIS and both frames do NOT specify the encoding // making us resort to auto-detection (when it IS turned on). if (!encoding && matchesCount > 0) encoding = ucsdet_getName(matches[0], &status); if (U_SUCCESS(status)) { *detectedEncoding = TextEncoding(encoding); ucsdet_close(detector); return true; } ucsdet_close(detector); return false; }
/**
 * Detects the most likely text encoding of a database file.
 *
 * Loads the database through LDB_Reader::Load(), joins the term strings it
 * populated with single spaces, and asks ICU for the best charset match.
 * Several detected names are replaced by the specific converter names listed
 * below.
 *
 * @param database_file path handed to LDB_Reader::Load().
 * @return the encoding name; empty when the build lacks LCF_SUPPORT_ICU, when
 *         the terms hold no text, or when ICU fails or finds no match.
 */
std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
	std::string encoding;
#ifdef LCF_SUPPORT_ICU
	std::ostringstream text;

	//Populate Data::terms or will empty by default even if load fails
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save << " " <<
	Data::terms.menu_quit << " " <<
	Data::terms.new_game << " " <<
	Data::terms.load_game << " " <<
	Data::terms.exit_game << " " <<
	Data::terms.status << " " <<
	Data::terms.row << " " <<
	Data::terms.order << " " <<
	Data::terms.wait_on << " " <<
	Data::terms.wait_off << " " <<
	Data::terms.level << " " <<
	Data::terms.health_points << " " <<
	Data::terms.spirit_points << " " <<
	Data::terms.normal_status << " " <<
	Data::terms.exp_short << " " <<
	Data::terms.lvl_short << " " <<
	Data::terms.hp_short << " " <<
	Data::terms.sp_short << " " <<
	Data::terms.sp_cost << " " <<
	Data::terms.attack << " " <<
	Data::terms.defense << " " <<
	Data::terms.spirit << " " <<
	Data::terms.agility << " " <<
	Data::terms.weapon << " " <<
	Data::terms.shield << " " <<
	Data::terms.armor << " " <<
	Data::terms.helmet << " " <<
	Data::terms.accessory << " " <<
	Data::terms.save_game_message << " " <<
	Data::terms.load_game_message << " " <<
	Data::terms.file << " " <<
	Data::terms.exit_game_message << " " <<
	Data::terms.yes << " " <<
	Data::terms.no;

	// Take one copy of the sample; str() returns a fresh string on every call.
	const std::string sample = text.str();

	// Checks if there are more than the above 33 spaces (no data)
	if (sample.size() > 33) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		if (U_SUCCESS(status) && detector != NULL) {
			ucsdet_setText(detector, sample.c_str(), static_cast<int32_t>(sample.length()), &status);
			const UCharsetMatch* match = ucsdet_detect(detector, &status);
			if (U_SUCCESS(status) && match != NULL) {
				const char* name = ucsdet_getName(match, &status);
				// A NULL name must never be assigned to a std::string.
				if (U_SUCCESS(status) && name != NULL) {
					encoding = name;
				}
			}
		}
		ucsdet_close(detector);

		// Fixes to ensure proper Windows encodings
		if (encoding == "Shift_JIS") {
			encoding = "ibm-943_P130-1999"; // Japanese with Yen backslash
		} else if (encoding == "EUC-KR") {
			encoding = "ibm-949_P110-1999"; // Korean with Won backslash
		} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
			encoding = "ibm-5348_P100-1997"; // Occidental with Euro
		} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
			encoding = "ibm-5346_P100-1998"; // Central Europe with Euro
		} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
			encoding = "ibm-5347_P100-1998"; // Cyrillic with Euro
		} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
			encoding = "ibm-9448_X100-2005"; // Arabic with Euro + 8 chars
		} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
			encoding = "ibm-5349_P100-1998"; // Greek with Euro
		} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
			encoding = "ibm-9447_P100-2002"; // Hebrew with Euro
		}
	}
#endif

	return encoding;
}
/*
 * Detects the charset of a text value with ICU.
 *
 * buffer:     value to inspect; converted with text_to_cstring() and handed
 *             to ICU as a NUL-terminated string.
 * encoding:   receives the detected charset name, "ISO-8859-1" when ICU
 *             returns no match, or NULL when ICU reports a failure.
 * lang:       receives the detected language, or NULL in both fallback cases.
 * confidence: receives ICU's confidence, or 0 in both fallback cases.
 *
 * Returns the final ICU status code. Both fallback cases emit a WARNING.
 */
UErrorCode
detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* input = text_to_cstring(buffer);
    UErrorCode rc = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&rc);
    const UCharsetMatch* best;

    /* The length is given as STRING_IS_NULL_TERMINATED because input is a
     * C string. */
    ucsdet_setText(detector, input, STRING_IS_NULL_TERMINATED, &rc);

    best = ucsdet_detect(detector, &rc);

    if (best == NULL)
    {
        /* The NULL match is tested before the status, so this branch wins
         * whenever no match was returned. */
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", input)));
        *encoding = cstring_to_text("ISO-8859-1");
        *lang = NULL;
        *confidence = 0;
    }
    else if (U_FAILURE(rc))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: %s\n", u_errorName(rc))));
        *encoding = NULL;
        *lang = NULL;
        *confidence = 0;
    }
    else
    {
        *encoding = cstring_to_text(ucsdet_getName(best, &rc));
        *lang = cstring_to_text(ucsdet_getLanguage(best, &rc));
        *confidence = ucsdet_getConfidence(best, &rc);
    }

    /* UCharsetMatch is owned by UCharsetDetector, so its memory is freed
     * when the detector is closed; the outputs above are set first. */
    ucsdet_close(detector);
    pfree((void *) input);

    return rc;
}
static void UCharsetDetector_free(void *detector) { ucsdet_close(detector); }
/// Closes the ICU detector handle `cd` if one is held.
CharsetDetector::~CharsetDetector()
{
    if (cd == nullptr) {
        return;
    }
    ucsdet_close(cd);
}
/// Closes the ICU charset detector stored in m_encoding_detector.
c_EncodingDetector::~c_EncodingDetector()
{
    UCharsetDetector* handle = m_encoding_detector;
    ucsdet_close(handle);
}
int main(int argc, char** argv) { UErrorCode e = U_ZERO_ERROR; std::string filename = argc > 1 ? argv[1] : "main.hs"; std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate); if (!file.is_open()) { std::cerr << "I can't open that file. I hate you too." << std::endl; return 1; } std::string raw; raw.reserve(file.tellg()); file.seekg(0, std::ios::beg); raw.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); file.close(); UCharsetDetector *ucd = ucsdet_open(&e); ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e); ucsdet_setText(ucd, raw.c_str(), raw.size(), &e); const UCharsetMatch *ucm = ucsdet_detect(ucd, &e); if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl; if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } UChar *buf = new UChar[raw.size() + 1]; int out = ucsdet_getUChars(ucm, buf, raw.size(), &e); if (U_FAILURE(e)) { std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl; return e; } ucsdet_close(ucd); buf[out] = 0; icu::UnicodeString source(buf); delete [] buf; source.append("\n"); std::cout << "Read:" << std::endl << source << std::endl; dhc::lexer::layout l(source); while (!l.finished()) { dhc::lexer::match_ptr token (l.next()); if (token) { std::cout << token->flatten() << ' '; } else { std::cerr << filename << std::endl; } } std::cout << std::endl; dhc::parser::parser p(source); std::cout << "Created parser" << std::endl; if (!p.finished()) { dhc::lexer::match_ptr token (p.parse()); if (token) { print_tree(token, 0); } else { std::cerr << p.error(filename) << std::endl; } } return 0; }
void IrcMessageDecoder::uninitialize() { ucsdet_close(UCSD(d.detector)); }