C++ (Cpp) ucsdet_setText Examples

Example #1

0

Show file

File: ComponentStringServiceImpl.cpp Project: nitric1/EACRipper

		// Chooses a string converter for the byte string `str`.
		//
		// A leading FF FE / FE FF byte pair selects UTF-16LE / UTF-16BE directly;
		// any other input is handed to the ICU detector `cd` (length -1, i.e. the
		// buffer is passed as a terminated C string). Returns nullptr when ICU
		// reports a failure or when the converter rejects the charset name.
		IERServiceStringConverter *CharsetDetector::detect(const char *str)
		{
			const char *charsetName = nullptr;

			// && short-circuits, so str[1] is only read once str[0] has matched a
			// non-zero marker byte; the terminator is never stepped over.
			const bool bomLittle = (str[0] == '\xFF') && (str[1] == '\xFE');
			const bool bomBig = !bomLittle && (str[0] == '\xFE') && (str[1] == '\xFF');

			if(bomLittle)
			{
				charsetName = "UTF-16LE";
			}
			else if(bomBig)
			{
				charsetName = "UTF-16BE";
			}
			else
			{
				UErrorCode status = U_ZERO_ERROR;

				ucsdet_setText(cd, str, -1, &status);
				if(U_FAILURE(status))
					return nullptr;

				const UCharsetMatch *best = ucsdet_detect(cd, &status);
				if(U_FAILURE(status))
					return nullptr;

				charsetName = ucsdet_getName(best, &status);
				if(U_FAILURE(status))
					return nullptr;
			}

			StringCharsetConverter *converter = new StringCharsetConverter();
			if(converter->setCharset(charsetName))
				return ServicePointerManager::instance().append<IERServiceStringConverter>(converter);

			// The converter does not know this charset: discard it.
			delete converter;
			return nullptr;
		}

Example #2

0

Show file

File: ircsession.cpp Project: BackupTheBerlios/quazaa-svn

// Asks ICU which charset `text` is most likely encoded in.
//
// Returns the name of the best match, or an empty QByteArray when no match is
// found, when ICU reports an error (which is also logged with qWarning), or
// when the build has no ICU support (HAVE_ICU undefined).
static QByteArray detectEncoding(const QByteArray& text)
{
    Q_UNUSED(text);
    QByteArray detectedName;
#ifdef HAVE_ICU
    UErrorCode icuStatus = U_ZERO_ERROR;
    UCharsetDetector* const icuDetector = ucsdet_open(&icuStatus);

    if (icuDetector && U_SUCCESS(icuStatus))
    {
        ucsdet_setText(icuDetector, text.constData(), text.length(), &icuStatus);
        if (U_SUCCESS(icuStatus))
        {
            const UCharsetMatch* const best = ucsdet_detect(icuDetector, &icuStatus);
            if (best && U_SUCCESS(icuStatus))
                detectedName = ucsdet_getName(best, &icuStatus);
        }
    }

    // Any step above may have failed; report whichever error is pending.
    if (U_FAILURE(icuStatus))
    {
        qWarning("detectEncoding() failed: %s", u_errorName(icuStatus));
    }

    ucsdet_close(icuDetector);
#endif // HAVE_ICU
    return detectedName;
}

Example #3

0

Show file

File: mcharsetdetector.cpp Project: amtep/libmlocale

// Stores `ba` as the text to analyse and passes it to the ICU detector.
//
// The original bytes are kept in _ba. A non-empty input is additionally
// repeated in _baExtended until that copy is at least 50 bytes long, and the
// extended copy is what the detector receives. Any pending error after the
// call is logged with qWarning().
void MCharsetDetector::setText(const QByteArray &ba)
{
    Q_D(MCharsetDetector);
    clearError();

    d->_ba = ba;
    d->_baExtended = ba;
    if (!ba.isEmpty()) {
        // Pad short input by appending further copies of itself.
        while (d->_baExtended.size() < 50) {
            d->_baExtended.append(d->_ba);
        }
    }

    // A length of -1 asks ICU to treat the buffer as zero-terminated.
    const char *bytes = d->_baExtended.constData();
    ucsdet_setText(d->_uCharsetDetector, bytes, int32_t(-1), &(d->_status));

    if (hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
    }
}

Example #4

0

Show file

File: ext_icu_ucsdet.cpp Project: Alienfeel/hhvm

// Sets the text whose encoding is to be detected.
//
// A copy of `text` is kept in m_text and the ICU detector is pointed at that
// copy rather than at the argument. Throws Exception when ICU rejects the
// input.
void c_EncodingDetector::t_settext(const String& text) {
  m_text = text;

  UErrorCode status = U_ZERO_ERROR;
  ucsdet_setText(m_encoding_detector, m_text.data(), m_text.length(), &status);
  if (!U_FAILURE(status)) {
    return;
  }

  throw Exception(
    "Could not set encoding detector text to [%s], error %d (%s)",
    text.c_str(), status, u_errorName(status));
}

Example #5

0

Show file

File: ext_icu_ucsdet.cpp Project: d1saster/hiphop-php

// Sets the text whose encoding is to be detected.
//
// Throws Exception when ICU rejects the input.
//
// NOTE(review): ICU documents that ucsdet_setText() leaves ownership of the
// byte array with the caller and that it must stay valid until the detector
// is closed or given new text. Here the detector receives text.data()
// directly; confirm that the caller keeps `text` alive for as long as the
// detector is used afterwards.
void c_EncodingDetector::t_settext(CStrRef text) {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingDetector, EncodingDetector::settext);

  UErrorCode status = U_ZERO_ERROR;
  ucsdet_setText(m_encoding_detector, text.data(), text.length(), &status);
  if (!U_FAILURE(status)) {
    return;
  }

  throw Exception(
    "Could not set encoding detector text to [%s], error %d (%s)",
    text.c_str(), status, u_errorName(status));
}

Example #6

0

Show file

File: uchardet.c Project: sraach/uchardet

/*
 * Point the ICU detector wrapped by `self` at the Ruby string `text`.
 *
 * A nil `text` is ignored. Otherwise the value is converted with StringValue,
 * handed to ucsdet_setText(), the ICU status is passed to ensure(), and the
 * string is recorded through UCharsetDetector_set_text().
 * NOTE(review): ensure() is defined elsewhere; presumably it raises on a
 * failing status - confirm.
 */
static void
set_text(VALUE self, VALUE text)
{
    UErrorCode status;
    UCharsetDetector *detector;

    if (NIL_P(text))
        return;

    text = StringValue(text);
    status = U_ZERO_ERROR;
    Data_Get_Struct(self, UCharsetDetector, detector);

    ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
    ensure(status);

    UCharsetDetector_set_text(self, text);
}

Example #7

0

Show file

File: ircmessagedecoder_icu.cpp Project: PopRe/PopMM-Mobile

// Returns the name of the charset ICU considers the best match for `data`.
//
// The result is empty when no detector is available, when ICU finds no match,
// or when ICU reports an error (which is also logged with qWarning).
QByteArray IrcMessageDecoder::codecForData(const QByteArray &data) const
{
    QByteArray codecName;
    if (!d.detector)
        return codecName;

    UErrorCode icuStatus = U_ZERO_ERROR;
    ucsdet_setText(UCSD(d.detector), data.constData(), data.length(), &icuStatus);
    if (U_SUCCESS(icuStatus)) {
        const UCharsetMatch* const best = ucsdet_detect(UCSD(d.detector), &icuStatus);
        if (best && U_SUCCESS(icuStatus))
            codecName = ucsdet_getName(best, &icuStatus);
    }

    if (U_FAILURE(icuStatus)) {
        qWarning("IrcMessageDecoder::codecForData() failed: %s", u_errorName(icuStatus));
    }
    return codecName;
}

Example #8

0

Show file

File: encoding_detector.c Project: kkszysiu/pycharlockholmes

/*
 * Detect the most likely encoding of a byte string.
 *
 * Python arguments (parsed with "S|s"):
 *   content  - the string object to inspect
 *   hint_enc - optional encoding name (like "UTF-8") passed to the detector
 *              as the declared encoding
 *
 * Returns: {"type": "binary", "confidence": 100} when detect_binary_content()
 *          flags the input; otherwise a dict with "type", "encoding" and
 *          "confidence", plus "language" when ICU supplies a non-empty one;
 *          None when the detector yields no match.
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch *best;
    const char *enc_name;
    const char *enc_lang;
    int enc_confidence;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    best = ucsdet_detect(ch_ucd, &status);
    if (!best) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    enc_name = ucsdet_getName(best, &status);
    enc_lang = ucsdet_getLanguage(best, &status);
    enc_confidence = ucsdet_getConfidence(best, &status);

    /* Leave the "language" key out when ICU has nothing to say about it. */
    if (enc_lang == NULL || enc_lang[0] == '\0')
        return Py_BuildValue("{ss,ss,si}",
                "type", "text",
                "encoding", enc_name,
                "confidence", enc_confidence);

    return Py_BuildValue("{ss,ss,si,ss}",
            "type", "text",
            "encoding", enc_name,
            "confidence", enc_confidence,
            "language", enc_lang);
}

Example #9

0

Show file

File: tracker-encoding-libicu.c Project: UIKit0/tracker

/*
 * tracker_encoding_guess_icu:
 * @buffer: bytes whose character set should be guessed
 * @size: number of bytes in @buffer
 *
 * Runs ICU's charset detector over @buffer.
 *
 * Returns: a newly allocated charset name (release with g_free()), or %NULL
 * when @size does not fit ICU's 32-bit length or ICU reports an error at any
 * step.
 */
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
			    gsize        size)
{
	UCharsetDetector *detector = NULL;
	const UCharsetMatch *match;
	gchar *charset = NULL;
	/* ICU functions return immediately when the incoming status already
	 * signals failure, so the status must start out as U_ZERO_ERROR
	 * instead of an indeterminate value. */
	UErrorCode status = U_ZERO_ERROR;

	/* ICU takes the length as int32_t; reject anything that cannot fit
	 * before a detector is even allocated. */
	if (size >= G_MAXINT32)
		goto failure;

	detector = ucsdet_open (&status);

	if (U_FAILURE (status))
		goto failure;

	ucsdet_setText (detector, buffer, (int32_t) size, &status);

	if (U_FAILURE (status))
		goto failure;

	match = ucsdet_detect (detector, &status);

	if (U_FAILURE (status))
		goto failure;

	/* Copy the name: the caller receives its own allocation, independent
	 * of the detector that is closed below. */
	charset = g_strdup (ucsdet_getName (match, &status));

	if (U_FAILURE (status)) {
		g_free (charset);
		charset = NULL;
	}

	if (charset)
		g_debug ("Guessing charset as '%s'", charset);

failure:
	if (detector)
		ucsdet_close (detector);

	return charset;
}

Example #10

0

Show file

File: lml-charset.c Project: Prelude-SIEM/prelude-lml

/*
 * Detect the character set of the `len` bytes at `in` using ICU.
 *
 * On success stores the best match's name in *charset and its confidence in
 * *confidence and returns 0. Returns -1 (after logging) when the input is too
 * large for ICU's 32-bit length or when any ICU call fails.
 *
 * NOTE(review): the success path returns without ucsdet_close(csd), so every
 * successful call keeps a detector allocated. ICU documents the string from
 * ucsdet_getName() as owned by the match, so closing the detector here would
 * formally invalidate *charset. Fixing the leak requires changing how the
 * name is handed to the caller (e.g. returning a copy).
 */
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence)
{
    UCharsetDetector *csd;
    const UCharsetMatch *ucm;
    UErrorCode status = U_ZERO_ERROR;

    /*
     * ucsdet_setText() takes an int32_t length and treats -1 as "input is
     * zero-terminated"; an unchecked size_t could silently wrap into a
     * negative value, so refuse anything that does not fit.
     */
    if ( len > (size_t) INT32_MAX ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: input of %lu bytes is too large for character set detection.\n", (unsigned long) len);
        return -1;
    }

    csd = ucsdet_open(&status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status));
        return -1;
    }

    ucsdet_setText(csd, in, (int32_t) len, &status);
    if( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status));
        goto error;
    }

    ucm = ucsdet_detect(csd, &status);
    if( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status));
        goto error;
    }

    *confidence = ucsdet_getConfidence(ucm, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status));
        goto error;
    }

    *charset = ucsdet_getName(ucm, &status);
    if ( U_FAILURE(status) ) {
        prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status));
        goto error;
    }

    return 0;

error:
    ucsdet_close(csd);
    return -1;
}

Example #11

0

Show file

File: qcharsetdetector.cpp Project: qt-labs/messagingframework

// Stores `ba` as the text to analyse and passes it to the ICU detector.
//
// A non-empty input is kept in _ba and repeated in _baExtended until that
// copy is at least 50 bytes long; an empty (possibly null) input leaves both
// members as empty strings. The extended copy, with one extra zero byte
// appended, is what the detector receives. Any pending error after the call
// is logged with qWarning().
void QCharsetDetector::setText(const QByteArray &ba)
{
    Q_D(QCharsetDetector);
    clearError();

    if (ba.isEmpty()) {
        // ba is empty, possibly null: store explicit empty strings.
        d->_ba = "";
        d->_baExtended = "";
    } else {
        d->_ba = ba;
        d->_baExtended = ba;
        while (d->_baExtended.size() < 50) {
            d->_baExtended.append(d->_ba);
        }
    }

    // Workaround for a libicu bug: it seems to sometimes read one byte past
    // the end of the input buffer, which made messageserver terminate
    // abnormally when running in valgrind.
    d->_baExtended.append(char(0));

    const char *bytes = d->_baExtended.constData();
    ucsdet_setText(d->_uCharsetDetector, bytes, int32_t(-1), &(d->_status));

    if (hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
    }
}

Example #12

0

Show file

File: mod_websocket_conv.c Project: Fomich/mod_websocket

/*
 * Report whether `data` (`siz` bytes) can be interpreted as UTF-8.
 *
 * Every candidate charset ICU returns for the input is examined; the result
 * is MOD_WEBSOCKET_TRUE as soon as one of them is named
 * MOD_WEBSOCKET_UTF8_STR (compared case-insensitively). NULL or empty input
 * counts as UTF-8. Input longer than INT32_MAX bytes, or any ICU failure,
 * yields MOD_WEBSOCKET_FALSE.
 */
mod_websocket_bool_t
mod_websocket_conv_isUTF8(const char *data, size_t siz) {
    mod_websocket_bool_t ret = MOD_WEBSOCKET_FALSE;
    UErrorCode err = U_ZERO_ERROR;
    UCharsetDetector *detector = NULL;
    const UCharsetMatch **match;
    int32_t f = 0, i;
    const char *name;

    if (!data || !siz) {
        return MOD_WEBSOCKET_TRUE;
    }
    if (siz > INT32_MAX) {
        return MOD_WEBSOCKET_FALSE;
    }
    detector = ucsdet_open(&err);
    if (U_FAILURE(err)) {
        return MOD_WEBSOCKET_FALSE;
    }
    /* siz was range-checked above, so the narrowing is safe. */
    ucsdet_setText(detector, data, (int32_t)siz, &err);
    if (U_FAILURE(err)) {
        goto go_out;
    }
    match = ucsdet_detectAll(detector, &f, &err);
    if (U_FAILURE(err)) {
        goto go_out;
    }
    for (i = 0; i < f; i++) {
        name = ucsdet_getName(match[i], &err);
        if (U_FAILURE(err) || !name) {
            /* Never hand a NULL name to strcasecmp(); once the status has
             * failed, later ICU calls would fail as well, so stop here. */
            break;
        }
        if (strcasecmp(MOD_WEBSOCKET_UTF8_STR, name) == 0) {
            ret = MOD_WEBSOCKET_TRUE;
            break;
        }
    }

 go_out:
    ucsdet_close(detector);
    detector = NULL;
    return ret;
}

Example #13

0

Show file

File: reader_util.cpp Project: fdelapena/easyrpg-liblcf

// Lists candidate encodings for the strings stored in `database_file`.
//
// The database is loaded, a fixed set of its term and system strings is
// concatenated, and ICU is asked for every charset matching that sample. Each
// detected name is replaced by a Windows-compatible converter name where the
// table below has one. The result keeps ICU's order (most confident first)
// and is empty when the sample is empty, when ICU returns no matches, or when
// the build has no ICU support (LCF_SUPPORT_ICU undefined).
std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
	std::vector<std::string> encodings;
#ifdef LCF_SUPPORT_ICU
	// { name reported by ICU, converter name used instead }
	static const char* const kWindowsFixes[][2] = {
		{ "Shift_JIS",    "ibm-943_P15A-2003"  }, // Japanese with \ as backslash
		{ "EUC-KR",       "windows-949-2000"   }, // Korean with \ as backslash
		{ "GB18030",      "windows-936-2000"   }, // Simplified Chinese
		{ "ISO-8859-1",   "ibm-5348_P100-1997" }, // Occidental with Euro
		{ "windows-1252", "ibm-5348_P100-1997" },
		{ "ISO-8859-2",   "ibm-5346_P100-1998" }, // Central Europe with Euro
		{ "windows-1250", "ibm-5346_P100-1998" },
		{ "ISO-8859-5",   "ibm-5347_P100-1998" }, // Cyrillic with Euro
		{ "windows-1251", "ibm-5347_P100-1998" },
		{ "ISO-8859-6",   "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
		{ "windows-1256", "ibm-9448_X100-2005" },
		{ "ISO-8859-7",   "ibm-5349_P100-1998" }, // Greek with Euro
		{ "windows-1253", "ibm-5349_P100-1998" },
		{ "ISO-8859-8",   "ibm-9447_P100-2002" }, // Hebrew with Euro
		{ "windows-1255", "ibm-9447_P100-2002" }
	};
	const int kWindowsFixCount =
		static_cast<int>(sizeof(kWindowsFixes) / sizeof(kWindowsFixes[0]));

	std::ostringstream text;

	// Populate Data::terms and Data::system; they stay empty by default
	// even if the load fails.
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save <<
	Data::terms.menu_quit <<
	Data::terms.new_game <<
	Data::terms.load_game <<
	Data::terms.exit_game <<
	Data::terms.status <<
	Data::terms.row <<
	Data::terms.order <<
	Data::terms.wait_on <<
	Data::terms.wait_off <<
	Data::terms.level <<
	Data::terms.health_points <<
	Data::terms.spirit_points <<
	Data::terms.normal_status <<
	Data::terms.exp_short <<
	Data::terms.lvl_short <<
	Data::terms.hp_short <<
	Data::terms.sp_short <<
	Data::terms.sp_cost <<
	Data::terms.attack <<
	Data::terms.defense <<
	Data::terms.spirit <<
	Data::terms.agility <<
	Data::terms.weapon <<
	Data::terms.shield <<
	Data::terms.armor <<
	Data::terms.helmet <<
	Data::terms.accessory <<
	Data::terms.save_game_message <<
	Data::terms.load_game_message <<
	Data::terms.file <<
	Data::terms.exit_game_message <<
	Data::terms.yes <<
	Data::terms.no <<
	Data::system.boat_name <<
	Data::system.ship_name <<
	Data::system.airship_name <<
	Data::system.title_name <<
	Data::system.gameover_name <<
	Data::system.system_name <<
	Data::system.system2_name <<
	Data::system.battletest_background <<
	Data::system.frame_name;

	const std::string sample = text.str();
	if (!sample.empty()) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		ucsdet_setText(detector, sample.c_str(), sample.length(), &status);

		int32_t matches_count;
		const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);

		if (matches != NULL) {
			// Collect all candidates, most confident comes first
			for (int i = 0; i < matches_count; ++i) {
				const std::string detected = ucsdet_getName(matches[i], &status);

				// Look for a Windows-compatible replacement name.
				const char* replacement = NULL;
				for (int k = 0; k < kWindowsFixCount && replacement == NULL; ++k) {
					if (detected == kWindowsFixes[k][0]) {
						replacement = kWindowsFixes[k][1];
					}
				}

				if (replacement != NULL) {
					encodings.push_back(replacement);
				} else {
					encodings.push_back(detected);
				}
			}
		}
		ucsdet_close(detector);
	}
#endif

	return encodings;
}

Example #14

0

Show file

File: TextEncodingDetectorICU.cpp Project: Channely/know-your-chrome

// Guesses the text encoding of the `len` bytes at `data` using ICU.
//
// data              bytes to inspect
// len               number of bytes in `data`
// hintEncodingName  optional encoding name; when one of ICU's sufficiently
//                   confident candidates equals it, the hint is chosen
// detectedEncoding  out: reset to a default TextEncoding first, then set to
//                   the chosen encoding on success
//
// Returns true when an encoding was stored in *detectedEncoding, false when
// any required ICU call failed. The detector is released on every path after
// it has been opened successfully.
bool detectTextEncoding(const char* data, size_t len,
                        const char* hintEncodingName,
                        TextEncoding* detectedEncoding)
{
    *detectedEncoding = TextEncoding();
    int matchesCount = 0;
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    if (U_FAILURE(status))
        return false;
    ucsdet_enableInputFilter(detector, true);
    ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
    if (U_FAILURE(status)) {
        // The detector was opened successfully above, so it has to be
        // released here as well; returning directly would leak it.
        ucsdet_close(detector);
        return false;
    }

    // FIXME: A few things we can do other than improving
    // the ICU detector itself.
    // 1. Use ucsdet_detectAll and pick the most likely one given
    // "the context" (parent-encoding, referrer encoding, etc).
    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
    // encoding with a highest confidence among the detector-specific
    // limited set of candidate encodings.
    // Below is a partial implementation of the first part of what's outlined
    // above.
    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);
        return false;
    }

    const char* encoding = 0;
    if (hintEncodingName) {
        TextEncoding hintEncoding(hintEncodingName);
        // 10 is the minimum confidence value consistent with the codepoint
        // allocation in a given encoding. The size of a chunk passed to
        // us varies even for the same html file (apparently depending on
        // the network load). When we're given a rather short chunk, we
        // don't have a sufficiently reliable signal other than the fact that
        // the chunk is consistent with a set of encodings. So, instead of
        // setting an arbitrary threshold, we have to scan all the encodings
        // consistent with the data.
        const int32_t kThreshold = 10;
        for (int i = 0; i < matchesCount; ++i) {
            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
            if (U_FAILURE(status)) {
                // Skip this candidate but keep scanning the others.
                status = U_ZERO_ERROR;
                continue;
            }
            if (confidence < kThreshold)
                break;
            const char* matchEncoding = ucsdet_getName(matches[i], &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (TextEncoding(matchEncoding) == hintEncoding) {
                encoding = hintEncodingName;
                break;
            }
        }
    }
    // If no match is found so far, just pick the top match.
    // This can happen, say, when a parent frame in EUC-JP refers to
    // a child frame in Shift_JIS and both frames do NOT specify the encoding
    // making us resort to auto-detection (when it IS turned on).
    if (!encoding && matchesCount > 0)
        encoding = ucsdet_getName(matches[0], &status);

    // Build the result before closing the detector: a name obtained from
    // ucsdet_getName() belongs to the detector's match.
    const bool succeeded = U_SUCCESS(status);
    if (succeeded)
        *detectedEncoding = TextEncoding(encoding);
    ucsdet_close(detector);
    return succeeded;
}

Example #15

0

Show file

File: reader_util.cpp Project: BigRabbit6/liblcf

// Guesses the encoding of the strings stored in `database_file`.
//
// The database is loaded, a fixed set of its term strings is joined with
// single spaces, and ICU is asked for the best matching charset. The detected
// name is replaced by a Windows-compatible converter name where the table
// below has one. Returns an empty string when the terms carry no data, when
// ICU finds no match, or when the build has no ICU support (LCF_SUPPORT_ICU
// undefined).
std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
	std::string encoding;
#ifdef LCF_SUPPORT_ICU
	// { name reported by ICU, converter name used instead }
	static const char* const kWindowsFixes[][2] = {
		{ "Shift_JIS",    "ibm-943_P130-1999"  }, // Japanese with Yen backslash
		{ "EUC-KR",       "ibm-949_P110-1999"  }, // Korean with Won backslash
		{ "ISO-8859-1",   "ibm-5348_P100-1997" }, // Occidental with Euro
		{ "windows-1252", "ibm-5348_P100-1997" },
		{ "ISO-8859-2",   "ibm-5346_P100-1998" }, // Central Europe with Euro
		{ "windows-1250", "ibm-5346_P100-1998" },
		{ "ISO-8859-5",   "ibm-5347_P100-1998" }, // Cyrillic with Euro
		{ "windows-1251", "ibm-5347_P100-1998" },
		{ "ISO-8859-6",   "ibm-9448_X100-2005" }, // Arabic with Euro + 8 chars
		{ "windows-1256", "ibm-9448_X100-2005" },
		{ "ISO-8859-7",   "ibm-5349_P100-1998" }, // Greek with Euro
		{ "windows-1253", "ibm-5349_P100-1998" },
		{ "ISO-8859-8",   "ibm-9447_P100-2002" }, // Hebrew with Euro
		{ "windows-1255", "ibm-9447_P100-2002" }
	};
	const int kWindowsFixCount =
		static_cast<int>(sizeof(kWindowsFixes) / sizeof(kWindowsFixes[0]));

	std::ostringstream text;

	// Populate Data::terms; it stays empty by default even if the load fails.
	LDB_Reader::Load(database_file, "");

	text <<
	Data::terms.menu_save << " " <<
	Data::terms.menu_quit << " " <<
	Data::terms.new_game << " " <<
	Data::terms.load_game << " " <<
	Data::terms.exit_game << " " <<
	Data::terms.status << " " <<
	Data::terms.row << " " <<
	Data::terms.order << " " <<
	Data::terms.wait_on << " " <<
	Data::terms.wait_off << " " <<
	Data::terms.level << " " <<
	Data::terms.health_points << " " <<
	Data::terms.spirit_points << " " <<
	Data::terms.normal_status << " " <<
	Data::terms.exp_short << " " <<
	Data::terms.lvl_short << " " <<
	Data::terms.hp_short << " " <<
	Data::terms.sp_short << " " <<
	Data::terms.sp_cost << " " <<
	Data::terms.attack << " " <<
	Data::terms.defense << " " <<
	Data::terms.spirit << " " <<
	Data::terms.agility << " " <<
	Data::terms.weapon << " " <<
	Data::terms.shield << " " <<
	Data::terms.armor << " " <<
	Data::terms.helmet << " " <<
	Data::terms.accessory << " " <<
	Data::terms.save_game_message << " " <<
	Data::terms.load_game_message << " " <<
	Data::terms.file << " " <<
	Data::terms.exit_game_message << " " <<
	Data::terms.yes << " " <<
	Data::terms.no;

	// The 34 terms are joined by 33 separator spaces, so a sample of at most
	// 33 characters carries no data.
	const std::string sample = text.str();
	if (sample.size() > 33) {
		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector* detector = ucsdet_open(&status);

		ucsdet_setText(detector, sample.c_str(), sample.length(), &status);

		const UCharsetMatch* best = ucsdet_detect(detector, &status);
		if (best != NULL) {
			encoding = ucsdet_getName(best, &status);
		}
		ucsdet_close(detector);

		// Fixes to ensure proper Windows encodings
		for (int k = 0; k < kWindowsFixCount; ++k) {
			if (encoding == kWindowsFixes[k][0]) {
				encoding = kWindowsFixes[k][1];
				break;
			}
		}
	}
#endif

	return encoding;
}

Example #16

0

Show file

File: parser_main.cpp Project: ThatOtherPerson/dhc

int main(int argc, char** argv)
{
    UErrorCode e = U_ZERO_ERROR;
    std::string filename = argc > 1 ? argv[1] : "main.hs";

    std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        std::cerr << "I can't open that file. I hate you too." << std::endl;
        return 1;
    }

    std::string raw;
    raw.reserve(file.tellg());

    file.seekg(0, std::ios::beg);
    raw.assign((std::istreambuf_iterator<char>(file)),
                 std::istreambuf_iterator<char>());

    file.close();

    UCharsetDetector *ucd = ucsdet_open(&e);

    ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e);
    ucsdet_setText(ucd, raw.c_str(), raw.size(), &e);
    const UCharsetMatch *ucm = ucsdet_detect(ucd, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        return e;
    }

    std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl;
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        return e;
    }
    
    UChar *buf = new UChar[raw.size() + 1];

    int out = ucsdet_getUChars(ucm, buf, raw.size(), &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl;
        return e;
    }

    ucsdet_close(ucd);

    buf[out] = 0;

    icu::UnicodeString source(buf);
    delete [] buf;

    source.append("\n");
    std::cout << "Read:" << std::endl << source << std::endl;

    dhc::lexer::layout l(source);

    while (!l.finished()) {
        dhc::lexer::match_ptr token (l.next());

        if (token) {
            std::cout << token->flatten() << ' ';
        } else {
            std::cerr << filename << std::endl;
        }
    }

    std::cout << std::endl;

    dhc::parser::parser p(source);

    std::cout << "Created parser" << std::endl;

    if (!p.finished()) {
        dhc::lexer::match_ptr token (p.parse());

        if (token) {
            print_tree(token, 0);
        } else {
            std::cerr << p.error(filename) << std::endl;
        }
    }

    return 0;
}

Example #17

0

Show file

File: encoding_detector.c Project: kkszysiu/pycharlockholmes

/*
 * Detect every encoding that plausibly matches a byte string.
 *
 * Python arguments (parsed with "S|s"):
 *   content  - the string object to inspect
 *   hint_enc - optional encoding name (like "UTF-8") passed to the detector
 *              as the declared encoding
 *
 * Returns: a list of dicts, one per ICU match, each with "type", "encoding"
 *          and "confidence", plus "language" when ICU supplies a non-empty
 *          one. Input flagged by detect_binary_content() yields a one-element
 *          list holding {"type": "binary", "confidence": 100}. None when ICU
 *          returns no match array. NULL (with the Python error left set) when
 *          building the list or one of its dicts fails.
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *content;
    PyObject *item;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int i, match_count;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        lst = PyList_New(1);
        if (!lst)
            return NULL;

        item = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        if (!item) {
            /* Do not hand back a list with an empty slot. */
            Py_DECREF(lst);
            return NULL;
        }
        PyList_SET_ITEM(lst, 0, item);
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (matches) {
        lst = PyList_New(match_count);
        if (!lst)
            return NULL;

        for (i = 0; i < match_count; ++i) {
            mname = ucsdet_getName(matches[i], &status);
            mlang = ucsdet_getLanguage(matches[i], &status);
            mconfidence = ucsdet_getConfidence(matches[i], &status);
            if (mlang && mlang[0])
                item = Py_BuildValue("{ss,ss,si,ss}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence,
                        "language", mlang);
            else
                item = Py_BuildValue("{ss,ss,si}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence);

            if (!item) {
                /* Building the dict failed: drop the partially filled list
                 * and propagate the pending Python error. */
                Py_DECREF(lst);
                return NULL;
            }

            /* PyList_SET_ITEM takes over the reference to item. */
            PyList_SET_ITEM(lst, i, item);
        }

        return lst;
    }

    Py_INCREF(Py_None);
    return Py_None;
}

Example #18

0

Show file

File: pg_chardetect.c Project: aweber/pg_chardetect

/*
 * Detect the character set of `buffer` with ICU.
 *
 * buffer     - text value to inspect; converted to a C string first
 * encoding   - out: detected charset name; "ISO-8859-1" when ICU finds no
 *              match; NULL when ICU reports an error
 * lang       - out: detected language; NULL when there is no match or ICU
 *              reports an error
 * confidence - out: ICU confidence value; 0 when there is no match or ICU
 *              reports an error
 *
 * Returns the final ICU status code. A WARNING is emitted both for an ICU
 * error and for the no-match case.
 */
UErrorCode
detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* cbuffer = text_to_cstring(buffer);

    UCharsetDetector* csd;
    const UCharsetMatch* csm;
    UErrorCode status = U_ZERO_ERROR;

    csd = ucsdet_open(&status);

    // set text buffer
    // use -1 for string length since NUL terminated
    ucsdet_setText(csd, cbuffer, STRING_IS_NULL_TERMINATED, &status);

    // detect charset
    csm = ucsdet_detect(csd, &status);

    // The status has to be tested before the match pointer: a failed ICU call
    // also yields a NULL match, so testing for NULL first would report every
    // error as "no charset match" and never reach the error branch.
    if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: %s\n", u_errorName(status))));

        *encoding = NULL;
        *lang = NULL;
        *confidence = 0;
    }
    else if (NULL == csm)
    {
        // charset match is NULL if no match
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", cbuffer)));

        *encoding = cstring_to_text("ISO-8859-1");
        *lang = NULL;
        *confidence = 0;
    }
    else
    {
        *encoding = cstring_to_text(ucsdet_getName(csm, &status));
        *lang = cstring_to_text(ucsdet_getLanguage(csm, &status));
        *confidence = ucsdet_getConfidence(csm, &status);
    }

    // close charset detector
    // UCharsetMatch is owned by UCharsetDetector so its memory will be
    // freed when the char set detector is closed
    ucsdet_close(csd);
    pfree((void *) cbuffer);
    return status;
}