/**
 * Detect the character set of a byte string and build a converter for it.
 *
 * A UTF-16 byte-order mark at the start of the input decides the charset
 * directly; otherwise the ICU charset detector held in `cd` is asked for its
 * best match.
 *
 * @param str NUL-terminated input bytes. A null pointer yields nullptr.
 * @return The pointer returned by ServicePointerManager::instance().append()
 *         for the new converter, or nullptr when the input is null, ICU
 *         reports an error, or the converter rejects the charset name.
 */
IERServiceStringConverter *CharsetDetector::detect(const char *str)
{
    if (str == nullptr)
        return nullptr;

    const char *name = nullptr;

    // Comparing against a BOM byte already excludes the terminator, and the
    // short-circuit keeps str[1] from being read when str[0] is '\0'.
    if (str[0] == '\xFF' && str[1] == '\xFE') {
        name = "UTF-16LE";
    } else if (str[0] == '\xFE' && str[1] == '\xFF') {
        name = "UTF-16BE";
    } else {
        UErrorCode err = U_ZERO_ERROR;

        ucsdet_setText(cd, str, -1, &err);
        if (U_FAILURE(err))
            return nullptr;

        const UCharsetMatch *cm = ucsdet_detect(cd, &err);
        if (U_FAILURE(err))
            return nullptr;

        // A null match makes ucsdet_getName() report an error through err.
        name = ucsdet_getName(cm, &err);
        if (U_FAILURE(err) || name == nullptr)
            return nullptr;
    }

    StringCharsetConverter *c = new StringCharsetConverter();
    if (!c->setCharset(name)) {
        delete c;
        return nullptr;
    }

    return ServicePointerManager::instance().append<IERServiceStringConverter>(c);
}
/**
 * Guess the character encoding of a byte array with ICU's charset detector.
 *
 * When HAVE_ICU is not defined nothing is detected and an empty array is
 * returned.
 *
 * @param text Raw bytes to inspect.
 * @return Name of the best matching charset, or an empty QByteArray when no
 *         match was produced.
 */
static QByteArray detectEncoding(const QByteArray& text)
{
    Q_UNUSED(text);
    QByteArray encoding;
#ifdef HAVE_ICU
    UErrorCode icuStatus = U_ZERO_ERROR;
    UCharsetDetector* icuDetector = ucsdet_open(&icuStatus);

    if (icuDetector && U_SUCCESS(icuStatus)) {
        ucsdet_setText(icuDetector, text.constData(), text.length(), &icuStatus);
        if (U_SUCCESS(icuStatus)) {
            const UCharsetMatch* best = ucsdet_detect(icuDetector, &icuStatus);
            if (best && U_SUCCESS(icuStatus)) {
                encoding = ucsdet_getName(best, &icuStatus);
            }
        }
    }

    // Any failure along the way is only reported; the result stays as it is.
    if (U_FAILURE(icuStatus))
        qWarning("detectEncoding() failed: %s", u_errorName(icuStatus));

    ucsdet_close(icuDetector);
#endif // HAVE_ICU
    return encoding;
}
/**
 * Store the bytes to analyse and hand them to the ICU charset detector.
 *
 * The original bytes are kept in `_ba`. The detector is given `_baExtended`,
 * which is the input repeated until it is at least 50 bytes long (empty input
 * is left empty). Any previous error is cleared first; a new error is logged
 * with qWarning().
 *
 * @param ba Bytes whose charset should be detected.
 */
void MCharsetDetector::setText(const QByteArray &ba)
{
    Q_D(MCharsetDetector);
    clearError();

    d->_ba = ba;
    d->_baExtended = ba;

    if (!ba.isEmpty()) {
        while (d->_baExtended.size() < 50) {
            d->_baExtended.append(d->_ba);
        }
    }

    // A length of -1 tells ICU that the buffer is NUL-terminated.
    // NOTE(review): with -1 ICU stops at the first NUL byte, so input with
    // embedded NULs would be cut short - confirm this is acceptable.
    const char *bytes = d->_baExtended.constData();
    ucsdet_setText(d->_uCharsetDetector, bytes, int32_t(-1), &(d->_status));

    if (hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
    }
}
void c_EncodingDetector::t_settext(const String& text) { UErrorCode status = U_ZERO_ERROR; m_text = text; ucsdet_setText( m_encoding_detector, m_text.data(), m_text.length(), &status); if (U_FAILURE(status)) { throw Exception( "Could not set encoding detector text to [%s], error %d (%s)", text.c_str(), status, u_errorName(status)); } }
/**
 * Set the text the encoding detector will analyse.
 *
 * @param text Bytes whose encoding should be detected.
 * @throws Exception when ICU rejects the text.
 *
 * NOTE(review): ICU documents that ucsdet_setText() does not take ownership of
 * the buffer and that it must stay valid while the detector refers to it.
 * Nothing in this function keeps `text` alive after it returns - confirm that
 * callers guarantee the lifetime.
 */
void c_EncodingDetector::t_settext(CStrRef text) {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingDetector, EncodingDetector::settext);

  UErrorCode icu_status = U_ZERO_ERROR;
  ucsdet_setText(m_encoding_detector, text.data(), text.length(),
                 &icu_status);
  if (!U_FAILURE(icu_status)) {
    return;
  }

  throw Exception(
    "Could not set encoding detector text to [%s], error %d (%s)",
    text.c_str(), icu_status, u_errorName(icu_status));
}
/*
 * Point the wrapped ICU detector at a new Ruby string.
 *
 * self - the Ruby object wrapping a UCharsetDetector
 * text - the value to analyse; nil leaves the detector untouched, anything
 *        else is converted with StringValue()
 *
 * The ICU status is passed to ensure(), and the converted string is then
 * handed to UCharsetDetector_set_text().
 */
static void set_text(VALUE self, VALUE text)
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *detector;

    if (NIL_P(text))
        return;

    text = StringValue(text);
    Data_Get_Struct(self, UCharsetDetector, detector);

    ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
    ensure(status);

    UCharsetDetector_set_text(self, text);
}
/**
 * Ask the ICU detector which charset the given bytes are most likely in.
 *
 * @param data Raw message bytes.
 * @return Name of the best matching charset, or an empty QByteArray when
 *         there is no detector or no match. ICU failures are logged with
 *         qWarning().
 */
QByteArray IrcMessageDecoder::codecForData(const QByteArray &data) const
{
    QByteArray codecName;

    // Without a detector nothing can fail, so there is nothing to report.
    if (!d.detector)
        return codecName;

    UErrorCode icuStatus = U_ZERO_ERROR;
    ucsdet_setText(UCSD(d.detector), data.constData(), data.length(), &icuStatus);
    if (U_SUCCESS(icuStatus)) {
        const UCharsetMatch* best = ucsdet_detect(UCSD(d.detector), &icuStatus);
        if (best && U_SUCCESS(icuStatus)) {
            codecName = ucsdet_getName(best, &icuStatus);
        }
    }

    if (U_FAILURE(icuStatus)) {
        qWarning("IrcMessageDecoder::codecForData() failed: %s", u_errorName(icuStatus));
    }
    return codecName;
}
/*
 * Attempt to detect the encoding of a string.
 *
 * str      - a String, the bytes whose encoding should be detected
 * hint_enc - an optional String (like "UTF-8"), an encoding name passed to
 *            the charset detector as its declared encoding
 *
 * Returns: a dict with "type" and "confidence", plus "encoding" and (when
 *          ICU reports one) "language" for text content; None when the
 *          detector produces no match.
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch *best;
    const char *enc_name;
    const char *enc_lang;
    int enc_confidence;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    /* Binary content is reported directly, without consulting ICU. */
    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content),
                   (int32_t)PyString_Size(content), &status);

    best = ucsdet_detect(ch_ucd, &status);
    if (!best)
        Py_RETURN_NONE;

    enc_name = ucsdet_getName(best, &status);
    enc_lang = ucsdet_getLanguage(best, &status);
    enc_confidence = ucsdet_getConfidence(best, &status);

    /* The language entry is only included when ICU returned a non-empty one. */
    if (enc_lang && enc_lang[0]) {
        return Py_BuildValue("{ss,ss,si,ss}",
                             "type", "text",
                             "encoding", enc_name,
                             "confidence", enc_confidence,
                             "language", enc_lang);
    }

    return Py_BuildValue("{ss,ss,si}",
                         "type", "text",
                         "encoding", enc_name,
                         "confidence", enc_confidence);
}
/**
 * tracker_encoding_guess_icu:
 * @buffer: bytes whose character set should be guessed
 * @size: number of bytes in @buffer; must be below G_MAXINT32
 *
 * Asks ICU's charset detector for the best matching character set.
 *
 * Returns: a newly allocated charset name (free with g_free()), or %NULL when
 * @size is too large, ICU reports an error, or nothing matched.
 */
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
                            gsize        size)
{
	UCharsetDetector *detector = NULL;
	const UCharsetMatch *match;
	gchar *charset = NULL;
	/* ICU functions return immediately when the incoming status already
	 * signals a failure, so it must start out as U_ZERO_ERROR. */
	UErrorCode status = U_ZERO_ERROR;

	/* ICU takes the length as int32_t; reject anything that cannot fit
	 * before spending time on opening a detector. */
	if (size >= G_MAXINT32)
		return NULL;

	detector = ucsdet_open (&status);
	if (U_FAILURE (status))
		goto out;

	ucsdet_setText (detector, buffer, (int32_t) size, &status);
	if (U_FAILURE (status))
		goto out;

	match = ucsdet_detect (detector, &status);
	if (U_FAILURE (status) || match == NULL)
		goto out;

	charset = g_strdup (ucsdet_getName (match, &status));
	if (U_FAILURE (status)) {
		g_free (charset);
		charset = NULL;
	}

	if (charset)
		g_debug ("Guessing charset as '%s'", charset);

out:
	if (detector)
		ucsdet_close (detector);

	return charset;
}
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence) { UCharsetDetector *csd; const UCharsetMatch *ucm; UErrorCode status = U_ZERO_ERROR; csd = ucsdet_open(&status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status)); return -1; } ucsdet_setText(csd, in, len, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status)); goto error; } ucm = ucsdet_detect(csd, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status)); goto error; } *confidence = ucsdet_getConfidence(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status)); goto error; } *charset = ucsdet_getName(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status)); goto error; } return 0; error: ucsdet_close(csd); return -1; }
void QCharsetDetector::setText(const QByteArray &ba) { Q_D(QCharsetDetector); clearError(); d->_ba = ba; d->_baExtended = ba; if (!ba.isEmpty()) { while (d->_baExtended.size() < 50) d->_baExtended += d->_ba; } else { // ba is empty, possibly null. d->_ba = ""; d->_baExtended = ""; } // Workaround for libicu bug, it seems to sometimes read past end of input buffer by one byte // This was causing messageserver to abnormally terminate when running in valgrind d->_baExtended.append(char(0)); ucsdet_setText(d->_uCharsetDetector, d->_baExtended.constData(), int32_t(-1), &(d->_status)); if(hasError()) qWarning() << __PRETTY_FUNCTION__ << errorString(); }
mod_websocket_bool_t mod_websocket_conv_isUTF8(const char *data, size_t siz) { mod_websocket_bool_t ret = MOD_WEBSOCKET_FALSE; UErrorCode err = U_ZERO_ERROR; UCharsetDetector *detector = NULL; const UCharsetMatch **match; int32_t f = 0, i; const char *name; if (!data || !siz) { return MOD_WEBSOCKET_TRUE; } if (siz > INT32_MAX) { return MOD_WEBSOCKET_FALSE; } detector = ucsdet_open(&err); if (U_FAILURE(err)) { return MOD_WEBSOCKET_FALSE; } ucsdet_setText(detector, data, siz, &err); if (U_FAILURE(err)) { goto go_out; } match = ucsdet_detectAll(detector, &f, &err); if (U_FAILURE(err)) { goto go_out; } for (i = 0; i < f; i++) { name = ucsdet_getName(match[i], &err); if (strcasecmp(MOD_WEBSOCKET_UTF8_STR, name) == 0) { ret = MOD_WEBSOCKET_TRUE; break; } } go_out: ucsdet_close(detector); detector = NULL; return ret; }
/**
 * Detect the candidate encodings of a database file.
 *
 * The database is loaded, a sample is built from its vocabulary terms and
 * system names, and ICU's charset detector is asked for all matches. Some
 * detected names are replaced by Windows-compatible converter names.
 *
 * Without LCF_SUPPORT_ICU nothing is detected.
 *
 * @param database_file Path of the database file to load.
 * @return Encoding names in the order ICU reported them, or an empty vector
 *         when the sample is empty or nothing was detected.
 */
std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
	std::vector<std::string> encodings;
#ifdef LCF_SUPPORT_ICU
	std::ostringstream text;

	// Populate Data::terms and Data::system; they stay empty by default
	// even if loading fails.
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save <<
	Data::terms.menu_quit <<
	Data::terms.new_game <<
	Data::terms.load_game <<
	Data::terms.exit_game <<
	Data::terms.status <<
	Data::terms.row <<
	Data::terms.order <<
	Data::terms.wait_on <<
	Data::terms.wait_off <<
	Data::terms.level <<
	Data::terms.health_points <<
	Data::terms.spirit_points <<
	Data::terms.normal_status <<
	Data::terms.exp_short <<
	Data::terms.lvl_short <<
	Data::terms.hp_short <<
	Data::terms.sp_short <<
	Data::terms.sp_cost <<
	Data::terms.attack <<
	Data::terms.defense <<
	Data::terms.spirit <<
	Data::terms.agility <<
	Data::terms.weapon <<
	Data::terms.shield <<
	Data::terms.armor <<
	Data::terms.helmet <<
	Data::terms.accessory <<
	Data::terms.save_game_message <<
	Data::terms.load_game_message <<
	Data::terms.file <<
	Data::terms.exit_game_message <<
	Data::terms.yes <<
	Data::terms.no <<
	Data::system.boat_name <<
	Data::system.ship_name <<
	Data::system.airship_name <<
	Data::system.title_name <<
	Data::system.gameover_name <<
	Data::system.system_name <<
	Data::system.system2_name <<
	Data::system.battletest_background <<
	Data::system.frame_name;

	// Copy the stream contents once; the detector reads from this buffer.
	const std::string sample = text.str();

	if (!sample.empty()) {
		// Detected name (plus an optional second spelling) and the converter
		// used instead, to ensure proper Windows encodings.
		static const struct {
			const char* detected;
			const char* alternate;
			const char* converter;
		} windows_fixes[] = {
			{ "Shift_JIS",  NULL,           "ibm-943_P15A-2003"  }, // Japanese
			{ "EUC-KR",     NULL,           "windows-949-2000"   }, // Korean
			{ "GB18030",    NULL,           "windows-936-2000"   }, // Simplified Chinese
			{ "ISO-8859-1", "windows-1252", "ibm-5348_P100-1997" }, // Occidental with Euro
			{ "ISO-8859-2", "windows-1250", "ibm-5346_P100-1998" }, // Central Europe with Euro
			{ "ISO-8859-5", "windows-1251", "ibm-5347_P100-1998" }, // Cyrillic with Euro
			{ "ISO-8859-6", "windows-1256", "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
			{ "ISO-8859-7", "windows-1253", "ibm-5349_P100-1998" }, // Greek with Euro
			{ "ISO-8859-8", "windows-1255", "ibm-9447_P100-2002" }  // Hebrew with Euro
		};
		const size_t fix_count = sizeof(windows_fixes) / sizeof(windows_fixes[0]);

		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);
		ucsdet_setText(detector, sample.c_str(), static_cast<int32_t>(sample.length()), &status);

		int32_t matches_count = 0;
		const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);

		if (matches != NULL) {
			// Collect all candidates, most confident comes first
			for (int32_t i = 0; i < matches_count; ++i) {
				const char* name = ucsdet_getName(matches[i], &status);
				if (U_FAILURE(status) || name == NULL) {
					// Never build a std::string from NULL; skip this match.
					status = U_ZERO_ERROR;
					continue;
				}

				std::string encoding = name;
				for (size_t k = 0; k < fix_count; ++k) {
					const bool hit = encoding == windows_fixes[k].detected ||
						(windows_fixes[k].alternate != NULL &&
						 encoding == windows_fixes[k].alternate);
					if (hit) {
						encoding = windows_fixes[k].converter;
						break;
					}
				}
				encodings.push_back(encoding);
			}
		}
		ucsdet_close(detector);
	}
#endif

	return encodings;
}
bool detectTextEncoding(const char* data, size_t len, const char* hintEncodingName, TextEncoding* detectedEncoding) { *detectedEncoding = TextEncoding(); int matchesCount = 0; UErrorCode status = U_ZERO_ERROR; UCharsetDetector* detector = ucsdet_open(&status); if (U_FAILURE(status)) return false; ucsdet_enableInputFilter(detector, true); ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); if (U_FAILURE(status)) return false; // FIXME: A few things we can do other than improving // the ICU detector itself. // 1. Use ucsdet_detectAll and pick the most likely one given // "the context" (parent-encoding, referrer encoding, etc). // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. // Chinese, Japanese, Russian, Korean and Hebrew) by picking the // encoding with a highest confidence among the detector-specific // limited set of candidate encodings. // Below is a partial implementation of the first part of what's outlined // above. const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); if (U_FAILURE(status)) { ucsdet_close(detector); return false; } const char* encoding = 0; if (hintEncodingName) { TextEncoding hintEncoding(hintEncodingName); // 10 is the minimum confidence value consistent with the codepoint // allocation in a given encoding. The size of a chunk passed to // us varies even for the same html file (apparently depending on // the network load). When we're given a rather short chunk, we // don't have a sufficiently reliable signal other than the fact that // the chunk is consistent with a set of encodings. So, instead of // setting an arbitrary threshold, we have to scan all the encodings // consistent with the data. 
const int32_t kThresold = 10; for (int i = 0; i < matchesCount; ++i) { int32_t confidence = ucsdet_getConfidence(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (confidence < kThresold) break; const char* matchEncoding = ucsdet_getName(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (TextEncoding(matchEncoding) == hintEncoding) { encoding = hintEncodingName; break; } } } // If no match is found so far, just pick the top match. // This can happen, say, when a parent frame in EUC-JP refers to // a child frame in Shift_JIS and both frames do NOT specify the encoding // making us resort to auto-detection (when it IS turned on). if (!encoding && matchesCount > 0) encoding = ucsdet_getName(matches[0], &status); if (U_SUCCESS(status)) { *detectedEncoding = TextEncoding(encoding); ucsdet_close(detector); return true; } ucsdet_close(detector); return false; }
/**
 * Detect the most likely encoding of a database file.
 *
 * The database is loaded, a sample is built from its vocabulary terms
 * (separated by single spaces), and ICU's charset detector is asked for its
 * best match. Some detected names are replaced by Windows-compatible
 * converter names.
 *
 * Without LCF_SUPPORT_ICU nothing is detected.
 *
 * @param database_file Path of the database file to load.
 * @return The encoding name, or an empty string when the terms are all empty
 *         or nothing was detected.
 */
std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
	std::string encoding;
#ifdef LCF_SUPPORT_ICU
	std::ostringstream text;

	// Populate Data::terms; it stays empty by default even if loading fails.
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save << " " <<
	Data::terms.menu_quit << " " <<
	Data::terms.new_game << " " <<
	Data::terms.load_game << " " <<
	Data::terms.exit_game << " " <<
	Data::terms.status << " " <<
	Data::terms.row << " " <<
	Data::terms.order << " " <<
	Data::terms.wait_on << " " <<
	Data::terms.wait_off << " " <<
	Data::terms.level << " " <<
	Data::terms.health_points << " " <<
	Data::terms.spirit_points << " " <<
	Data::terms.normal_status << " " <<
	Data::terms.exp_short << " " <<
	Data::terms.lvl_short << " " <<
	Data::terms.hp_short << " " <<
	Data::terms.sp_short << " " <<
	Data::terms.sp_cost << " " <<
	Data::terms.attack << " " <<
	Data::terms.defense << " " <<
	Data::terms.spirit << " " <<
	Data::terms.agility << " " <<
	Data::terms.weapon << " " <<
	Data::terms.shield << " " <<
	Data::terms.armor << " " <<
	Data::terms.helmet << " " <<
	Data::terms.accessory << " " <<
	Data::terms.save_game_message << " " <<
	Data::terms.load_game_message << " " <<
	Data::terms.file << " " <<
	Data::terms.exit_game_message << " " <<
	Data::terms.yes << " " <<
	Data::terms.no;

	// Copy the stream contents once; the detector reads from this buffer.
	const std::string sample = text.str();

	// Checks if there are more than the above 33 spaces (no data)
	if (sample.size() > 33) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);
		ucsdet_setText(detector, sample.c_str(), static_cast<int32_t>(sample.length()), &status);

		const UCharsetMatch* match = ucsdet_detect(detector, &status);
		if (match != NULL) {
			// Never assign NULL to a std::string.
			const char* name = ucsdet_getName(match, &status);
			if (name != NULL) {
				encoding = name;
			}
		}
		ucsdet_close(detector);

		// Detected name (plus an optional second spelling) and the converter
		// used instead, to ensure proper Windows encodings.
		static const struct {
			const char* detected;
			const char* alternate;
			const char* converter;
		} windows_fixes[] = {
			{ "Shift_JIS",  NULL,           "ibm-943_P130-1999"  }, // Japanese with Yen backslash
			{ "EUC-KR",     NULL,           "ibm-949_P110-1999"  }, // Korean with Won backslash
			{ "ISO-8859-1", "windows-1252", "ibm-5348_P100-1997" }, // Occidental with Euro
			{ "ISO-8859-2", "windows-1250", "ibm-5346_P100-1998" }, // Central Europe with Euro
			{ "ISO-8859-5", "windows-1251", "ibm-5347_P100-1998" }, // Cyrillic with Euro
			{ "ISO-8859-6", "windows-1256", "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
			{ "ISO-8859-7", "windows-1253", "ibm-5349_P100-1998" }, // Greek with Euro
			{ "ISO-8859-8", "windows-1255", "ibm-9447_P100-2002" }  // Hebrew with Euro
		};
		const size_t fix_count = sizeof(windows_fixes) / sizeof(windows_fixes[0]);

		for (size_t k = 0; k < fix_count; ++k) {
			const bool hit = encoding == windows_fixes[k].detected ||
				(windows_fixes[k].alternate != NULL &&
				 encoding == windows_fixes[k].alternate);
			if (hit) {
				encoding = windows_fixes[k].converter;
				break;
			}
		}
	}
#endif

	return encoding;
}
int main(int argc, char** argv) { UErrorCode e = U_ZERO_ERROR; std::string filename = argc > 1 ? argv[1] : "main.hs"; std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate); if (!file.is_open()) { std::cerr << "I can't open that file. I hate you too." << std::endl; return 1; } std::string raw; raw.reserve(file.tellg()); file.seekg(0, std::ios::beg); raw.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); file.close(); UCharsetDetector *ucd = ucsdet_open(&e); ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e); ucsdet_setText(ucd, raw.c_str(), raw.size(), &e); const UCharsetMatch *ucm = ucsdet_detect(ucd, &e); if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl; if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } UChar *buf = new UChar[raw.size() + 1]; int out = ucsdet_getUChars(ucm, buf, raw.size(), &e); if (U_FAILURE(e)) { std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl; return e; } ucsdet_close(ucd); buf[out] = 0; icu::UnicodeString source(buf); delete [] buf; source.append("\n"); std::cout << "Read:" << std::endl << source << std::endl; dhc::lexer::layout l(source); while (!l.finished()) { dhc::lexer::match_ptr token (l.next()); if (token) { std::cout << token->flatten() << ' '; } else { std::cerr << filename << std::endl; } } std::cout << std::endl; dhc::parser::parser p(source); std::cout << "Created parser" << std::endl; if (!p.finished()) { dhc::lexer::match_ptr token (p.parse()); if (token) { print_tree(token, 0); } else { std::cerr << p.error(filename) << std::endl; } } return 0; }
/*
 * Attempt to detect the encoding of a string, and return a list with all the
 * possible encodings that match it.
 *
 * str      - a String, the bytes whose encoding should be detected
 * hint_enc - an optional String (like "UTF-8"), an encoding name passed to
 *            the charset detector as its declared encoding
 *
 * Returns: a list with one dict per match, each with "type", "encoding" and
 *          "confidence" and (when ICU reports one) "language"; a one-element
 *          list describing binary content; None when the detector produces
 *          no result; NULL with a Python exception set on failure.
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *content;
    PyObject *item;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int32_t i;
    int32_t match_count = 0;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        item = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        if (!item)
            return NULL;

        lst = PyList_New(1);
        if (!lst) {
            Py_DECREF(item);
            return NULL;
        }

        /* PyList_SET_ITEM takes over the reference to item. */
        PyList_SET_ITEM(lst, 0, item);
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (!matches) {
        Py_RETURN_NONE;
    }

    lst = PyList_New(match_count);
    if (!lst)
        return NULL;

    for (i = 0; i < match_count; ++i) {
        mname = ucsdet_getName(matches[i], &status);
        mlang = ucsdet_getLanguage(matches[i], &status);
        mconfidence = ucsdet_getConfidence(matches[i], &status);

        /* The language entry is only included when ICU returned one. */
        if (mlang && mlang[0])
            item = Py_BuildValue("{ss,ss,si,ss}",
                                 "type", "text",
                                 "encoding", mname,
                                 "confidence", mconfidence,
                                 "language", mlang);
        else
            item = Py_BuildValue("{ss,ss,si}",
                                 "type", "text",
                                 "encoding", mname,
                                 "confidence", mconfidence);

        /* Never store NULL in the list: drop what was built so far and
         * propagate the exception set by Py_BuildValue. */
        if (!item) {
            Py_DECREF(lst);
            return NULL;
        }

        PyList_SET_ITEM(lst, i, item);
    }

    return lst;
}
/*
 * Detect the charset of a text value with ICU.
 *
 * buffer     - text to analyse; converted with text_to_cstring() and passed
 *              to ICU as a NUL-terminated string
 * encoding   - receives the detected charset name, "ISO-8859-1" when ICU
 *              returns no match, or NULL on an ICU error with a match
 * lang       - receives the detected language, or NULL when there is no
 *              usable match
 * confidence - receives ICU's confidence, or 0 when there is no usable match
 *
 * Returns the final ICU status code.
 *
 * NOTE(review): the NULL-match test runs before the failure test, so a
 * failure that also yields a NULL match takes the "assuming ISO-8859-1"
 * branch - confirm that this order is intended.
 */
UErrorCode detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* cbuffer = text_to_cstring(buffer);
    UCharsetDetector* detector;
    const UCharsetMatch* best;
    UErrorCode status = U_ZERO_ERROR;

    detector = ucsdet_open(&status);

    /* The C string is NUL-terminated, so no explicit length is passed. */
    ucsdet_setText(detector, cbuffer, STRING_IS_NULL_TERMINATED, &status);

    best = ucsdet_detect(detector, &status);

    if (NULL == best)
    {
        /* No match at all: fall back to ISO-8859-1. */
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", cbuffer)));
        *encoding = cstring_to_text("ISO-8859-1");
        *lang = NULL;
        *confidence = 0;
    }
    else if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: %s\n", u_errorName(status))));
        *encoding = NULL;
        *lang = NULL;
        *confidence = 0;
    }
    else
    {
        *encoding = cstring_to_text(ucsdet_getName(best, &status));
        *lang = cstring_to_text(ucsdet_getLanguage(best, &status));
        *confidence = ucsdet_getConfidence(best, &status);
    }

    /*
     * The match is owned by the detector, so its memory is released when the
     * detector is closed.
     */
    ucsdet_close(detector);
    pfree((void *) cbuffer);

    return status;
}