/*
 * call-seq:
 *   detect(text=nil, declared_encoding=nil)
 *
 * Return the charset that best matches the supplied input data.
 *
 * Note though, that because the detection only looks at the start of the
 * input data, there is a possibility that the returned charset will fail
 * to handle the full set of input data.
 *
 * The function will fail if
 * * no charset appears to match the data
 * * no input text has been provided (with +text+ or set with #text= )
 *
 * On success the result is a Hash with the keys :encoding, :confidence
 * and :language.
 */
static VALUE
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE text;
    VALUE declared_encoding;

    /* Both arguments are optional ("02"). */
    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
    set_text(self, text);
    set_declared_encoding(self, declared_encoding);

    UCharsetDetector *detector;
    Data_Get_Struct(self, UCharsetDetector, detector);

    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch *best = ucsdet_detect(detector, &status);
    ensure(status);

    /* Each accessor is followed by ensure() so a failure is reported
     * before the returned value is used. */
    const char *name = ucsdet_getName(best, &status);
    ensure(status);

    int32_t confidence = ucsdet_getConfidence(best, &status);
    ensure(status);

    const char *language = ucsdet_getLanguage(best, &status);
    ensure(status);

    VALUE result = rb_hash_new();
    rb_hash_aset(result, ID2SYM(rb_intern("encoding")), rb_str_new2(name));
    rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(confidence));
    rb_hash_aset(result, ID2SYM(rb_intern("language")), rb_str_new2(language));
    return result;
}
// Guess the character encoding of `text` with ICU's charset detector.
//
// Returns the detected encoding name, or an empty QByteArray when ICU
// support is compiled out (HAVE_ICU undefined), when nothing matched, or
// when an ICU call failed (in which case a warning is logged).
static QByteArray detectEncoding(const QByteArray& text)
{
    Q_UNUSED(text);
    QByteArray encoding;
#ifdef HAVE_ICU
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);

    const UCharsetMatch* match = NULL;
    if (detector && U_SUCCESS(status)) {
        ucsdet_setText(detector, text.constData(), text.length(), &status);
        if (U_SUCCESS(status))
            match = ucsdet_detect(detector, &status);
    }

    // `match` can only be non-null here if detection actually ran.
    if (match && U_SUCCESS(status))
        encoding = ucsdet_getName(match, &status);

    if (U_FAILURE(status))
        qWarning("detectEncoding() failed: %s", u_errorName(status));

    ucsdet_close(detector);
#endif // HAVE_ICU
    return encoding;
}
// Pick a string converter for the NUL-terminated byte string `str`.
//
// A leading FF FE / FE FF byte pair selects UTF-16LE / UTF-16BE directly;
// otherwise the charset name comes from the ICU detector held in `cd`.
// Returns nullptr when an ICU call fails or when the converter rejects the
// charset name; otherwise returns the pointer handed back by
// ServicePointerManager::append().
IERServiceStringConverter *CharsetDetector::detect(const char *str)
{
    const char *charsetName = nullptr;

    // The second byte is only inspected when the first one matched, so an
    // empty string is never read past its terminator.
    if (str[0] == '\xFF' && str[1] == '\xFE') {
        charsetName = "UTF-16LE";
    } else if (str[0] == '\xFE' && str[1] == '\xFF') {
        charsetName = "UTF-16BE";
    } else {
        UErrorCode status = U_ZERO_ERROR;

        ucsdet_setText(cd, str, -1, &status);
        if (U_FAILURE(status))
            return nullptr;

        const UCharsetMatch *best = ucsdet_detect(cd, &status);
        if (U_FAILURE(status))
            return nullptr;

        charsetName = ucsdet_getName(best, &status);
        if (U_FAILURE(status))
            return nullptr;
    }

    StringCharsetConverter *converter = new StringCharsetConverter();
    if (c_setCharsetFailed:
        !converter->setCharset(charsetName)) {
        delete converter;
        return nullptr;
    }

    return ServicePointerManager::instance().append<IERServiceStringConverter>(converter);
}
// EncodingDetector::detect(): runs ICU charset detection on the detector
// held by this object and wraps the resulting match in an EncodingMatch.
// A failing ICU status is reported through data->throwException().
static Object HHVM_METHOD(EncodingDetector, detect) {
  FETCH_DET(data, this_);

  UErrorCode status = U_ZERO_ERROR;
  const UCharsetMatch* best = ucsdet_detect(data->detector(), &status);
  if (U_FAILURE(status)) {
    data->throwException("Could not detect encoding, error %d (%s)",
                         status, u_errorName(status));
  }

  return EncodingMatch::newInstance(best);
}
// Runs ICU charset detection on m_encoding_detector and returns a new
// c_EncodingMatch object holding the match. Throws Exception when ICU
// reports a failure status.
Object c_EncodingDetector::t_detect() {
  UErrorCode err = U_ZERO_ERROR;
  const UCharsetMatch* best = ucsdet_detect(m_encoding_detector, &err);

  if (U_SUCCESS(err)) {
    p_EncodingMatch wrapper = NEWOBJ(c_EncodingMatch)();
    wrapper->m_encoding_match = best;
    return wrapper;
  }

  throw Exception(
    "Could not detect encoding, error %d (%s)",
    err, u_errorName(err));
}
// Guess the encoding of `data` using the ICU detector stored in d.detector.
//
// Returns the detected encoding name, or an empty QByteArray when there is
// no detector, nothing matched, or an ICU call failed (a warning is logged
// in the failure case).
QByteArray IrcMessageDecoder::codecForData(const QByteArray &data) const
{
    QByteArray name;

    // Without a detector there is nothing to do and nothing to report.
    if (!d.detector)
        return name;

    UErrorCode err = U_ZERO_ERROR;
    ucsdet_setText(UCSD(d.detector), data.constData(), data.length(), &err);

    if (U_SUCCESS(err)) {
        const UCharsetMatch* best = ucsdet_detect(UCSD(d.detector), &err);
        if (best && U_SUCCESS(err))
            name = ucsdet_getName(best, &err);
    }

    if (U_FAILURE(err))
        qWarning("IrcMessageDecoder::codecForData() failed: %s", u_errorName(err));

    return name;
}
/*
 * Attempt to detect the encoding of a string.
 *
 * Python arguments (parsed with "S|s"):
 *   str      - a String, what you want to detect the encoding of
 *   hint_enc - an optional String (like "UTF-8"), the encoding name which
 *              will be passed to the charset detector as the declared
 *              encoding
 *
 * Returns:
 *   - {"type": "binary", "confidence": 100} when detect_binary_content()
 *     reports the content as binary;
 *   - {"type": "text", "encoding": ..., "confidence": ...} plus a
 *     "language" entry when the detector reports a non-empty language;
 *   - None when the detector yields no match;
 *   - NULL when argument parsing fails.
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch *best;
    const char *name;
    const char *language;
    int confidence;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);

    best = ucsdet_detect(ch_ucd, &status);
    if (!best) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    name = ucsdet_getName(best, &status);
    language = ucsdet_getLanguage(best, &status);
    confidence = ucsdet_getConfidence(best, &status);

    /* The language key is only emitted when a language was reported. */
    if (language == NULL || language[0] == '\0')
        return Py_BuildValue("{ss,ss,si}", "type", "text", "encoding", name, "confidence", confidence);

    return Py_BuildValue("{ss,ss,si,ss}", "type", "text", "encoding", name, "confidence", confidence, "language", language);
}
/*
 * Guess the character set of a byte buffer using ICU's charset detector.
 *
 * buffer - the bytes to inspect
 * size   - number of bytes in buffer; must be below G_MAXINT32
 *
 * Returns a newly allocated charset name (release with g_free()), or NULL
 * when the buffer is too large, no charset could be determined, or an ICU
 * call failed.
 */
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
                            gsize        size)
{
	UCharsetDetector *detector = NULL;
	const UCharsetMatch *match;
	gchar *charset = NULL;
	/* ICU functions return immediately when the incoming status already
	 * signals a failure, so the status must start out as U_ZERO_ERROR. */
	UErrorCode status = U_ZERO_ERROR;

	detector = ucsdet_open (&status);
	if (U_FAILURE (status))
		goto failure;

	/* ucsdet_setText() takes an int32_t length. */
	if (size >= G_MAXINT32)
		goto failure;

	ucsdet_setText (detector, buffer, (int32_t) size, &status);
	if (U_FAILURE (status))
		goto failure;

	match = ucsdet_detect (detector, &status);
	if (U_FAILURE (status))
		goto failure;

	/* The name is copied because the match is released together with
	 * the detector below. */
	charset = g_strdup (ucsdet_getName (match, &status));
	if (U_FAILURE (status)) {
		g_free (charset);
		charset = NULL;
	}

	if (charset)
		g_debug ("Guessing charset as '%s'", charset);

failure:
	if (detector)
		ucsdet_close (detector);

	return charset;
}
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence) { UCharsetDetector *csd; const UCharsetMatch *ucm; UErrorCode status = U_ZERO_ERROR; csd = ucsdet_open(&status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status)); return -1; } ucsdet_setText(csd, in, len, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status)); goto error; } ucm = ucsdet_detect(csd, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status)); goto error; } *confidence = ucsdet_getConfidence(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status)); goto error; } *charset = ucsdet_getName(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status)); goto error; } return 0; error: ucsdet_close(csd); return -1; }
// Guess the text encoding of a database file.
//
// Loads the database (with an empty encoding argument) so that Data::terms
// is populated, joins a fixed set of its term strings with single spaces
// and feeds the result to ICU's charset detector. Some detected names are
// then replaced by specific ICU converter names.
//
// Returns the (possibly replaced) encoding name, or an empty string when
// LCF_SUPPORT_ICU is not defined, the terms are all empty, or ICU found no
// match.
std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
	std::string encoding;
#ifdef LCF_SUPPORT_ICU
	// Number of separators written below: 34 terms joined by 33 spaces.
	const size_t separator_count = 33;

	// Populate Data::terms; it stays empty by default even if load fails
	LDB_Reader::Load(database_file, "");

	std::ostringstream joined;
	joined <<
	Data::terms.menu_save << " " <<
	Data::terms.menu_quit << " " <<
	Data::terms.new_game << " " <<
	Data::terms.load_game << " " <<
	Data::terms.exit_game << " " <<
	Data::terms.status << " " <<
	Data::terms.row << " " <<
	Data::terms.order << " " <<
	Data::terms.wait_on << " " <<
	Data::terms.wait_off << " " <<
	Data::terms.level << " " <<
	Data::terms.health_points << " " <<
	Data::terms.spirit_points << " " <<
	Data::terms.normal_status << " " <<
	Data::terms.exp_short << " " <<
	Data::terms.lvl_short << " " <<
	Data::terms.hp_short << " " <<
	Data::terms.sp_short << " " <<
	Data::terms.sp_cost << " " <<
	Data::terms.attack << " " <<
	Data::terms.defense << " " <<
	Data::terms.spirit << " " <<
	Data::terms.agility << " " <<
	Data::terms.weapon << " " <<
	Data::terms.shield << " " <<
	Data::terms.armor << " " <<
	Data::terms.helmet << " " <<
	Data::terms.accessory << " " <<
	Data::terms.save_game_message << " " <<
	Data::terms.load_game_message << " " <<
	Data::terms.file << " " <<
	Data::terms.exit_game_message << " " <<
	Data::terms.yes << " " <<
	Data::terms.no;

	const std::string sample = joined.str();

	// Anything longer than the separators alone means there is real data.
	if (sample.size() > separator_count) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);
		ucsdet_setText(detector, sample.c_str(), sample.length(), &status);

		const UCharsetMatch* match = ucsdet_detect(detector, &status);
		if (match != NULL) {
			encoding = ucsdet_getName(match, &status);
		}
		ucsdet_close(detector);

		// Fixes to ensure proper Windows encodings
		struct EncodingFix {
			const char* detected;
			const char* replacement;
		};
		static const EncodingFix fixes[] = {
			{ "Shift_JIS",    "ibm-943_P130-1999"  }, // Japanese with Yen backslash
			{ "EUC-KR",       "ibm-949_P110-1999"  }, // Korean with Won backslash
			{ "ISO-8859-1",   "ibm-5348_P100-1997" }, // Occidental with Euro
			{ "windows-1252", "ibm-5348_P100-1997" },
			{ "ISO-8859-2",   "ibm-5346_P100-1998" }, // Central Europe with Euro
			{ "windows-1250", "ibm-5346_P100-1998" },
			{ "ISO-8859-5",   "ibm-5347_P100-1998" }, // Cyrillic with Euro
			{ "windows-1251", "ibm-5347_P100-1998" },
			{ "ISO-8859-6",   "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
			{ "windows-1256", "ibm-9448_X100-2005" },
			{ "ISO-8859-7",   "ibm-5349_P100-1998" }, // Greek with Euro
			{ "windows-1253", "ibm-5349_P100-1998" },
			{ "ISO-8859-8",   "ibm-9447_P100-2002" }, // Hebrew with Euro
			{ "windows-1255", "ibm-9447_P100-2002" }
		};

		for (size_t i = 0; i < sizeof(fixes) / sizeof(fixes[0]); ++i) {
			if (encoding == fixes[i].detected) {
				encoding = fixes[i].replacement;
				break;
			}
		}
	}
#endif

	return encoding;
}
/*
 * Detect the charset of a text value using ICU's charset detector.
 *
 * buffer     - the text to analyse (converted to a NUL-terminated C string)
 * encoding   - out: detected charset name; "ISO-8859-1" when ICU finds no
 *              match; NULL when ICU reports an error
 * lang       - out: detected language, or NULL when there is no match or
 *              ICU reports an error
 * confidence - out: match confidence, or 0 when there is no match or ICU
 *              reports an error
 *
 * Returns the final ICU status code. A WARNING is emitted both for ICU
 * errors and for the no-match case.
 *
 * ICU errors are tested before the NULL-match test: ICU hands back a NULL
 * match whenever the status is a failure, so testing for NULL first would
 * report every ICU error as "no charset match".
 */
UErrorCode
detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* cbuffer = text_to_cstring(buffer);

    UCharsetDetector* csd;
    const UCharsetMatch* csm;
    UErrorCode status = U_ZERO_ERROR;

    csd = ucsdet_open(&status);

    // set text buffer
    // use -1 for string length since NUL terminated
    ucsdet_setText(csd, cbuffer, STRING_IS_NULL_TERMINATED, &status);

    // detect charset
    csm = ucsdet_detect(csd, &status);

    if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: %s\n", u_errorName(status))));

        *encoding = NULL;
        *lang = NULL;
        *confidence = 0;
    }
    else if (NULL == csm)
    {
        // charset match is NULL if no match
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", cbuffer)));

        *encoding = cstring_to_text("ISO-8859-1");
        *lang = NULL;
        *confidence = 0;
    }
    else
    {
        *encoding = cstring_to_text(ucsdet_getName(csm, &status));
        *lang = cstring_to_text(ucsdet_getLanguage(csm, &status));
        *confidence = ucsdet_getConfidence(csm, &status);
    }

    // close charset detector
    // UCharsetMatch is owned by UCharsetDetector so its memory will be
    // freed when the char set detector is closed
    ucsdet_close(csd);
    pfree((void *) cbuffer);

    return status;
}
int main(int argc, char** argv) { UErrorCode e = U_ZERO_ERROR; std::string filename = argc > 1 ? argv[1] : "main.hs"; std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate); if (!file.is_open()) { std::cerr << "I can't open that file. I hate you too." << std::endl; return 1; } std::string raw; raw.reserve(file.tellg()); file.seekg(0, std::ios::beg); raw.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); file.close(); UCharsetDetector *ucd = ucsdet_open(&e); ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e); ucsdet_setText(ucd, raw.c_str(), raw.size(), &e); const UCharsetMatch *ucm = ucsdet_detect(ucd, &e); if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl; if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } UChar *buf = new UChar[raw.size() + 1]; int out = ucsdet_getUChars(ucm, buf, raw.size(), &e); if (U_FAILURE(e)) { std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl; return e; } ucsdet_close(ucd); buf[out] = 0; icu::UnicodeString source(buf); delete [] buf; source.append("\n"); std::cout << "Read:" << std::endl << source << std::endl; dhc::lexer::layout l(source); while (!l.finished()) { dhc::lexer::match_ptr token (l.next()); if (token) { std::cout << token->flatten() << ' '; } else { std::cerr << filename << std::endl; } } std::cout << std::endl; dhc::parser::parser p(source); std::cout << "Created parser" << std::endl; if (!p.finished()) { dhc::lexer::match_ptr token (p.parse()); if (token) { print_tree(token, 0); } else { std::cerr << p.error(filename) << std::endl; } } return 0; }