Ejemplo n.º 1
0
/*
 * call-seq:
 *   detect(text=nil, declared_encoding=nil)
 *
 * Detect the charset that best matches the supplied input data and return
 * a Hash describing it, with the keys +:encoding+, +:confidence+ and
 * +:language+.
 *
 * Both arguments are optional; each one is passed to the matching setter
 * helper before detection runs.
 *
 * Note though, that because the detection only looks at the start of the
 * input data, there is a possibility that the returned charset will fail
 * to handle the full set of input data.
 *
 * The function will fail if
 * * no charset appears to match the data
 * * no input text has been provided (with +text+ or set with #text= )
 */
static VALUE
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;
    UCharsetDetector *ucd;
    UErrorCode status = U_ZERO_ERROR;

    /* Accept zero, one or two optional arguments. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);
    set_text(self, input_text);
    set_declared_encoding(self, encoding_hint);

    Data_Get_Struct(self, UCharsetDetector, ucd);

    /* Every ICU call is followed by ensure() so a failure is reported
     * before the next result is used. */
    const UCharsetMatch *best = ucsdet_detect(ucd, &status);
    ensure(status);

    const char *name = ucsdet_getName(best, &status);
    ensure(status);

    int32_t confidence = ucsdet_getConfidence(best, &status);
    ensure(status);

    const char *language = ucsdet_getLanguage(best, &status);
    ensure(status);

    /* Assemble the result Hash: encoding, confidence, language. */
    VALUE result = rb_hash_new();
    VALUE key_encoding = ID2SYM(rb_intern("encoding"));
    rb_hash_aset(result, key_encoding, rb_str_new2(name));
    VALUE key_confidence = ID2SYM(rb_intern("confidence"));
    rb_hash_aset(result, key_confidence, INT2NUM(confidence));
    VALUE key_language = ID2SYM(rb_intern("language"));
    rb_hash_aset(result, key_language, rb_str_new2(language));

    return result;
}
Ejemplo n.º 2
0
/**
 * Guess the character encoding of a raw byte sequence using ICU.
 *
 * @param text bytes to analyse
 * @return the name of the best-matching charset, or an empty QByteArray when
 *         the build lacks HAVE_ICU or when detection does not succeed
 */
static QByteArray detectEncoding(const QByteArray& text)
{
    Q_UNUSED(text);
    QByteArray encoding;
#ifdef HAVE_ICU
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);

    // Only proceed when the detector was actually created.
    const bool ready = detector && !U_FAILURE(status);
    if (ready)
    {
        ucsdet_setText(detector, text.constData(), text.length(), &status);

        const UCharsetMatch* best = 0;
        if (!U_FAILURE(status))
            best = ucsdet_detect(detector, &status);

        if (best && !U_FAILURE(status))
            encoding = ucsdet_getName(best, &status);
    }

    // Report whichever step failed, if any.
    if (U_FAILURE(status)) {
        qWarning("detectEncoding() failed: %s", u_errorName(status));
    }

    ucsdet_close(detector);
#endif // HAVE_ICU
    return encoding;
}
		/**
		 * Determine the charset of a NUL-terminated byte string and build a
		 * converter configured for it.
		 *
		 * A leading FF FE or FE FF byte pair selects UTF-16LE / UTF-16BE
		 * directly; any other input is handed to the ICU detector held in `cd`.
		 *
		 * @param str NUL-terminated input bytes
		 * @return the converter as registered through ServicePointerManager, or
		 *         nullptr when an ICU call fails or the converter rejects the
		 *         charset name
		 */
		IERServiceStringConverter *CharsetDetector::detect(const char *str)
		{
			const char *charsetName = nullptr;

			// A matching first byte is never NUL, so the second byte is only
			// read when the first one already matched.
			const bool bomLittleEndian = str[0] == '\xFF' && str[1] == '\xFE';
			const bool bomBigEndian = !bomLittleEndian && str[0] == '\xFE' && str[1] == '\xFF';

			if(bomLittleEndian)
			{
				charsetName = "UTF-16LE";
			}
			else if(bomBigEndian)
			{
				charsetName = "UTF-16BE";
			}
			else
			{
				UErrorCode status = U_ZERO_ERROR;

				// -1 tells ICU that the text is NUL-terminated.
				ucsdet_setText(cd, str, -1, &status);
				if(U_FAILURE(status))
					return nullptr;

				const UCharsetMatch *best = ucsdet_detect(cd, &status);
				if(U_FAILURE(status))
					return nullptr;

				charsetName = ucsdet_getName(best, &status);
				if(U_FAILURE(status))
					return nullptr;
			}

			StringCharsetConverter *converter = new StringCharsetConverter();
			if(converter->setCharset(charsetName))
				return ServicePointerManager::instance().append<IERServiceStringConverter>(converter);

			// The converter could not be configured for this charset; discard it.
			delete converter;
			return nullptr;
		}
Ejemplo n.º 4
0
/**
 * EncodingDetector::detect(): run ICU charset detection on the detector held
 * by this object and wrap the resulting match in an EncodingMatch instance.
 *
 * A failing ICU status is reported through data->throwException()
 * (presumably throws; confirm against its definition).
 */
static Object HHVM_METHOD(EncodingDetector, detect) {
  FETCH_DET(data, this_);

  UErrorCode status = U_ZERO_ERROR;
  const UCharsetMatch* best = ucsdet_detect(data->detector(), &status);
  if (U_FAILURE(status)) {
    data->throwException("Could not detect encoding, error %d (%s)",
                         status, u_errorName(status));
  }

  return EncodingMatch::newInstance(best);
}
Ejemplo n.º 5
0
/**
 * Run ICU charset detection on m_encoding_detector and return a new
 * c_EncodingMatch object holding the resulting match.
 *
 * @throws Exception when ICU reports a failure status
 */
Object c_EncodingDetector::t_detect() {
  UErrorCode err = U_ZERO_ERROR;
  const UCharsetMatch* best = ucsdet_detect(m_encoding_detector, &err);

  if (U_FAILURE(err)) {
    throw Exception(
      "Could not detect encoding, error %d (%s)", err, u_errorName(err));
  }

  // Hand the raw ICU match to a freshly created wrapper object.
  p_EncodingMatch wrapper = NEWOBJ(c_EncodingMatch)();
  wrapper->m_encoding_match = best;
  return wrapper;
}
Ejemplo n.º 6
0
/**
 * Guess the charset of a chunk of received data with the ICU detector stored
 * in d.detector.
 *
 * @param data raw bytes to analyse
 * @return the detected charset name, or an empty QByteArray when there is no
 *         detector or detection does not succeed
 */
QByteArray IrcMessageDecoder::codecForData(const QByteArray &data) const
{
    QByteArray codecName;

    // Without a detector there is nothing to do (and nothing to warn about).
    if (!d.detector)
        return codecName;

    UErrorCode err = U_ZERO_ERROR;
    ucsdet_setText(UCSD(d.detector), data.constData(), data.length(), &err);

    if (!U_FAILURE(err)) {
        const UCharsetMatch* best = ucsdet_detect(UCSD(d.detector), &err);
        if (best && !U_FAILURE(err))
            codecName = ucsdet_getName(best, &err);
    }

    if (U_FAILURE(err))
        qWarning("IrcMessageDecoder::codecForData() failed: %s", u_errorName(err));

    return codecName;
}
Ejemplo n.º 7
0
/*
 * Attempt to detect the encoding of a string.
 *
 * str      - a String, what you want to detect the encoding of
 * hint_enc - an optional String (like "UTF-8"), the encoding name which will
 *            be used as an additional hint to the charset detector
 *
 * Returns:
 *   - {"type": "binary", "confidence": 100} when detect_binary_content()
 *     reports the content as binary
 *   - a dict with "type" = "text", "encoding", "confidence" and, only when
 *     the detector reports a non-empty language, "language"
 *   - None when the detector yields no match
 *   - NULL when argument parsing fails
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    const UCharsetMatch *best;
    const char *name;
    const char *language;
    int confidence;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    /* Binary content short-circuits before the detector is touched. */
    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    best = ucsdet_detect(ch_ucd, &status);

    if (!best) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    name = ucsdet_getName(best, &status);
    language = ucsdet_getLanguage(best, &status);
    confidence = ucsdet_getConfidence(best, &status);

    /* The language entry is left out when the detector gives none. */
    if (!language || !language[0]) {
        return Py_BuildValue("{ss,ss,si}",
                "type", "text",
                "encoding", name,
                "confidence", confidence);
    }

    return Py_BuildValue("{ss,ss,si,ss}",
            "type", "text",
            "encoding", name,
            "confidence", confidence,
            "language", language);
}
Ejemplo n.º 8
0
/**
 * tracker_encoding_guess_icu:
 * @buffer: bytes whose character set should be guessed
 * @size: number of bytes in @buffer; must be smaller than G_MAXINT32
 *
 * Guesses the character set of @buffer with the ICU charset detector.
 *
 * Returns: a newly allocated charset name (created with g_strdup()), or
 * %NULL when @size is too large or any ICU call fails.
 */
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
			    gsize        size)
{
	UCharsetDetector *detector = NULL;
	const UCharsetMatch *match;
	gchar *charset = NULL;
	/* Must start out as U_ZERO_ERROR: ICU functions return immediately
	 * when the status they are handed already holds a failure code, so
	 * an uninitialised value could make ucsdet_open() fail spuriously. */
	UErrorCode status = U_ZERO_ERROR;

	detector = ucsdet_open (&status);

	if (U_FAILURE (status))
		goto failure;

	/* ucsdet_setText() takes a 32-bit signed length */
	if (size >= G_MAXINT32)
		goto failure;

	ucsdet_setText (detector, buffer, (int32_t) size, &status);

	if (U_FAILURE (status))
		goto failure;

	match = ucsdet_detect (detector, &status);

	if (U_FAILURE (status))
		goto failure;

	/* Copy the name before the detector is closed below */
	charset = g_strdup (ucsdet_getName (match, &status));

	if (U_FAILURE (status)) {
		g_free (charset);
		charset = NULL;
	}

	if (charset)
		g_debug ("Guessing charset as '%s'", charset);

failure:
	if (detector)
		ucsdet_close (detector);

	return charset;
}
Ejemplo n.º 9
0
/*
 * Detect the character set of a byte buffer with the ICU charset detector.
 *
 * in         - input bytes
 * len        - number of bytes in `in`
 * charset    - out: name of the detected character set
 * confidence - out: confidence value reported by ICU for the match
 *
 * Returns 0 on success and -1 on failure (after logging the ICU error).
 * `*confidence` may already have been written when a later step fails.
 *
 * NOTE(review): the detector is not closed on the success path. Presumably
 * this is because `*charset` points at storage tied to the match/detector;
 * confirm against callers before changing it.
 */
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence)
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *detector;
    const UCharsetMatch *best;

    detector = ucsdet_open(&status);
    if ( U_FAILURE(status) ) {
        /* Nothing was opened, so there is nothing to close. */
        prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status));
        return -1;
    }

    ucsdet_setText(detector, in, len, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status));
        goto error;
    }

    best = ucsdet_detect(detector, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status));
        goto error;
    }

    *confidence = ucsdet_getConfidence(best, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status));
        goto error;
    }

    *charset = ucsdet_getName(best, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status));
        goto error;
    }

    return 0;

error:
    ucsdet_close(detector);
    return -1;
}
Ejemplo n.º 10
0
/**
 * Guess the text encoding of a database file from its vocabulary terms.
 *
 * The database is loaded, a fixed set of Data::terms strings is joined with
 * single spaces, and the joined text is given to the ICU charset detector.
 * Selected detector results are then replaced by specific ICU converter names.
 *
 * @param database_file path passed to LDB_Reader::Load
 * @return the (possibly remapped) encoding name, or an empty string when
 *         built without LCF_SUPPORT_ICU, when the terms are all empty, or
 *         when the detector yields no match
 */
std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
	std::string encoding;
#ifdef LCF_SUPPORT_ICU
	// Fill Data::terms; per the original note they stay empty by default
	// even if the load fails.
	LDB_Reader::Load(database_file, "");

	std::ostringstream joined;
	joined
		<< Data::terms.menu_save << " "
		<< Data::terms.menu_quit << " "
		<< Data::terms.new_game << " "
		<< Data::terms.load_game << " "
		<< Data::terms.exit_game << " "
		<< Data::terms.status << " "
		<< Data::terms.row << " "
		<< Data::terms.order << " "
		<< Data::terms.wait_on << " "
		<< Data::terms.wait_off << " "
		<< Data::terms.level << " "
		<< Data::terms.health_points << " "
		<< Data::terms.spirit_points << " "
		<< Data::terms.normal_status << " "
		<< Data::terms.exp_short << " "
		<< Data::terms.lvl_short << " "
		<< Data::terms.hp_short << " "
		<< Data::terms.sp_short << " "
		<< Data::terms.sp_cost << " "
		<< Data::terms.attack << " "
		<< Data::terms.defense << " "
		<< Data::terms.spirit << " "
		<< Data::terms.agility << " "
		<< Data::terms.weapon << " "
		<< Data::terms.shield << " "
		<< Data::terms.armor << " "
		<< Data::terms.helmet << " "
		<< Data::terms.accessory << " "
		<< Data::terms.save_game_message << " "
		<< Data::terms.load_game_message << " "
		<< Data::terms.file << " "
		<< Data::terms.exit_game_message << " "
		<< Data::terms.yes << " "
		<< Data::terms.no;

	const std::string sample = joined.str();

	// The 34 terms are separated by 33 spaces; anything longer means at
	// least one term carries data.
	if (sample.size() > 33)
	{
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		ucsdet_setText(detector, sample.c_str(), sample.length(), &status);

		const UCharsetMatch* best = ucsdet_detect(detector, &status);
		if (best != NULL)
		{
			encoding = ucsdet_getName(best, &status);
		}
		ucsdet_close(detector);

		// Fixes to ensure proper Windows encodings
		struct EncodingFix {
			const char* detected;
			const char* replacement;
		};
		static const EncodingFix fixes[] = {
			{ "Shift_JIS",    "ibm-943_P130-1999" },  // Japanese with Yen backslash
			{ "EUC-KR",       "ibm-949_P110-1999" },  // Korean with Won backslash
			{ "ISO-8859-1",   "ibm-5348_P100-1997" }, // Occidental with Euro
			{ "windows-1252", "ibm-5348_P100-1997" },
			{ "ISO-8859-2",   "ibm-5346_P100-1998" }, // Central Europe with Euro
			{ "windows-1250", "ibm-5346_P100-1998" },
			{ "ISO-8859-5",   "ibm-5347_P100-1998" }, // Cyrillic with Euro
			{ "windows-1251", "ibm-5347_P100-1998" },
			{ "ISO-8859-6",   "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
			{ "windows-1256", "ibm-9448_X100-2005" },
			{ "ISO-8859-7",   "ibm-5349_P100-1998" }, // Greek with Euro
			{ "windows-1253", "ibm-5349_P100-1998" },
			{ "ISO-8859-8",   "ibm-9447_P100-2002" }, // Hebrew with Euro
			{ "windows-1255", "ibm-9447_P100-2002" }
		};

		// Apply at most one replacement, like the original else-if chain.
		for (size_t i = 0; i < sizeof(fixes) / sizeof(fixes[0]); ++i)
		{
			if (encoding == fixes[i].detected)
			{
				encoding = fixes[i].replacement;
				break;
			}
		}
	}
#endif

	return encoding;
}
Ejemplo n.º 11
0
/*
 * Detect the charset of a text value with the ICU charset detector.
 *
 * buffer     - input text; converted to a NUL-terminated C string first
 * encoding   - out: detected charset name; "ISO-8859-1" when there is no
 *              match; NULL in the ICU-failure branch
 * lang       - out: detected language; NULL when there is no match or on
 *              ICU failure
 * confidence - out: confidence reported by ICU; 0 when there is no match or
 *              on ICU failure
 *
 * Returns the final ICU status code. A warning is reported when there is no
 * match or when ICU signals a failure.
 */
UErrorCode
detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* input = text_to_cstring(buffer);
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    const UCharsetMatch* best;

    // The input is NUL-terminated, so no explicit length is passed.
    ucsdet_setText(detector, input, STRING_IS_NULL_TERMINATED, &status);

    best = ucsdet_detect(detector, &status);

    if (NULL == best)
    {
        // No match: warn and fall back to a default charset.
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", input)));

        *encoding = cstring_to_text("ISO-8859-1");
        *lang = NULL;
        *confidence = 0;
    }
    else if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: %s\n", u_errorName(status))));

        *encoding = NULL;
        *lang = NULL;
        *confidence = 0;
    }
    else
    {
        *encoding = cstring_to_text(ucsdet_getName(best, &status));
        *lang = cstring_to_text(ucsdet_getLanguage(best, &status));
        *confidence = ucsdet_getConfidence(best, &status);
    }

    // Single exit: the match is owned by the detector (per the original
    // note), so the detector is closed only after the results were copied.
    ucsdet_close(detector);
    pfree((void *) input);
    return status;
}
Ejemplo n.º 12
0
/**
 * Entry point: read a source file, detect its charset with ICU, convert it to
 * UTF-16, then run the dhc lexer and parser over it and print the results.
 *
 * @param argc argument count
 * @param argv argv[1], when present, is the file to read ("main.hs" otherwise)
 * @return 0 on success, 1 when the file cannot be opened, or the ICU error
 *         code when charset detection or conversion fails
 */
int main(int argc, char** argv)
{
    UErrorCode e = U_ZERO_ERROR;
    std::string filename = argc > 1 ? argv[1] : "main.hs";

    // Open at the end so tellg() yields the size to reserve.
    std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        std::cerr << "I can't open that file. I hate you too." << std::endl;
        return 1;
    }

    std::string raw;
    raw.reserve(file.tellg());

    file.seekg(0, std::ios::beg);
    raw.assign((std::istreambuf_iterator<char>(file)),
                 std::istreambuf_iterator<char>());

    file.close();

    UCharsetDetector *ucd = ucsdet_open(&e);

    ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e);
    ucsdet_setText(ucd, raw.c_str(), raw.size(), &e);
    const UCharsetMatch *ucm = ucsdet_detect(ucd, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        ucsdet_close(ucd);
        return e;
    }

    // Query the match BEFORE printing: when no charset matched, the getters
    // set a failure status and the name is NULL, which must not be streamed.
    const char *charset_name = ucsdet_getName(ucm, &e);
    const int32_t charset_confidence = ucsdet_getConfidence(ucm, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        ucsdet_close(ucd);
        return e;
    }

    std::cout << "Charset detected: " << charset_name << " confidence: " << charset_confidence << std::endl;

    // One extra slot so the converted text can always be NUL-terminated.
    UChar *buf = new UChar[raw.size() + 1];

    int out = ucsdet_getUChars(ucm, buf, raw.size(), &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl;
        delete [] buf;
        ucsdet_close(ucd);
        return e;
    }

    // The match is not used past this point, so the detector can go.
    ucsdet_close(ucd);

    buf[out] = 0;

    icu::UnicodeString source(buf);
    delete [] buf;

    source.append("\n");
    std::cout << "Read:" << std::endl << source << std::endl;

    // Lexer pass: print every token produced by the layout lexer.
    dhc::lexer::layout l(source);

    while (!l.finished()) {
        dhc::lexer::match_ptr token (l.next());

        if (token) {
            std::cout << token->flatten() << ' ';
        } else {
            std::cerr << filename << std::endl;
        }
    }

    std::cout << std::endl;

    // Parser pass: print the parse tree, or the parser's error message.
    dhc::parser::parser p(source);

    std::cout << "Created parser" << std::endl;

    if (!p.finished()) {
        dhc::lexer::match_ptr token (p.parse());

        if (token) {
            print_tree(token, 0);
        } else {
            std::cerr << p.error(filename) << std::endl;
        }
    }

    return 0;
}